diff --git a/.ci/linux-x64-cpu-gcc.yml b/.ci/linux-x64-cpu-gcc.yml new file mode 100644 index 000000000000..4f138d9d080b --- /dev/null +++ b/.ci/linux-x64-cpu-gcc.yml @@ -0,0 +1,119 @@ +name: linux-x64-cpu-gcc +on: + push: + branches: [master] + paths: + - '.ci/linux-x64-cpu-gcc.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + mr: + target-branches: [master] + paths: + - '.ci/linux-x64-cpu-gcc.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: linux-x64-cpu-gcc-${{ ci.head_ref }} + +jobs: + linux-gcc: + name: linux-gcc + strategy: + matrix: + include: + - { SSE2: 'OFF', AVX: 'OFF', AVX2: 'OFF', AVX512: 'OFF' } + - { SSE2: 'ON', AVX: 'OFF', AVX2: 'OFF', AVX512: 'OFF' } + - { SSE2: 'ON', AVX: 'ON', AVX2: 'OFF', AVX512: 'OFF' } + - { SSE2: 'ON', AVX: 'ON', AVX2: 'ON', AVX512: 'OFF' } + - { SSE2: 'ON', AVX: 'ON', AVX2: 'ON', AVX512: 'ON' } + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y libprotobuf-dev protobuf-compiler libopencv-dev + + - name: build + run: | + mkdir build && cd build + cmake -DNCNN_SSE2=${{matrix.SSE2}} -DNCNN_AVX=${{matrix.AVX}} -DNCNN_AVX2=${{matrix.AVX2}} -DNCNN_AVX512=${{matrix.AVX512}} -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test + run: cd build && ctest --output-on-failure -j $(nproc) + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_SSE2=${{matrix.SSE2}} -DNCNN_AVX=${{matrix.AVX}} -DNCNN_AVX2=${{matrix.AVX2}} -DNCNN_AVX512=${{matrix.AVX512}} -DNCNN_SHARED_LIB=ON .. + cmake --build . -j $(nproc) + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DNCNN_SSE2=${{matrix.SSE2}} -DNCNN_AVX=${{matrix.AVX}} -DNCNN_AVX2=${{matrix.AVX2}} -DNCNN_AVX512=${{matrix.AVX512}} -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j $(nproc) + + linux-gcc-cpp03-nostdio-nostring-simplestl: + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: build-nostdio + run: | + mkdir build-nostdio && cd build-nostdio + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-nostdio + run: cd build-nostdio && ctest --output-on-failure -j $(nproc) + - name: build-nostdio-nostring + run: | + mkdir build-nostdio-nostring && cd build-nostdio-nostring + cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: build-simplestl + run: | + mkdir build-simplestl && cd build-simplestl + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j $(nproc) + - name: test-simplestl + run: cd build-simplestl && ctest --output-on-failure -j $(nproc) + - name: build-simplestl-simpleomp + run: | + mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-simplestl-simpleomp + run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml new file mode 100644 index 000000000000..596e753dec1c --- /dev/null +++ b/.ci/pnnx.yml @@ -0,0 +1,120 @@ +name: pnnx +on: + push: + branches: [master] + paths: + - '.ci/pnnx.yml' + - 'tools/pnnx/**' + - '!tools/pnnx/README.md' + mr: + target-branches: [master] + paths: + - '.ci/pnnx.yml' + - 'tools/pnnx/**' + - '!tools/pnnx/README.md' +concurrency: + group: pnnx-${{ ci.head_ref }} + +jobs: + ubuntu: + strategy: + matrix: + include: + - torch-version: 1.8.1 + torchvision-version: 0.9.1 + torchvision-cache-key: '0_9_1' + + - torch-version: 1.9.1 + torchvision-version: 0.10.1 + torchvision-cache-key: '0_10_1' + + - torch-version: 1.10.0 + torchvision-version: 0.11.1 + torchvision-cache-key: '0_11_1' + + - torch-version: 1.11.0 + torchvision-version: 0.12.0 + torchvision-cache-key: '0_12_0' + + - torch-version: 1.12.0 + torchvision-version: 0.13.0 + torchvision-cache-key: '0_13_0' + + - torch-version: 1.13.0 + torchvision-version: 0.14.0 + torchvision-cache-key: '0_14_0' + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y python3-pip libjpeg-dev libpng-dev libprotobuf-dev protobuf-compiler + python3 -m pip install --upgrade pip + pip3 uninstall -y setuptools + pip3 install -U pytest setuptools wheel twine distribute requests + + - name: setup pytorch + run: | + export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} + pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + + - name: cache-torchvision + id: cache-torchvision + uses: cache@1.* + with: + cachePaths: torchvision-${{matrix.torchvision-version}}-install + cacheKey: torchvision-${{matrix.torchvision-cache-key}}-linux-install-20211228 + - name: checkout-torchvision + if: steps.cache-torchvision.outputs.cacheHit != 'true' + checkout: https://github.com/pytorch/vision.git + with: + pullType: TAG + refName: v${{matrix.torchvision-version}} + localPath: vision + enableSubmodule: false + enableGitLfs: false + - name: torchvision + if: steps.cache-torchvision.outputs.cacheHit != 'true' + run: | + cd vision + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=${{ci.workspace}}/torchvision-${{matrix.torchvision-version}}-install -DTorch_DIR=${{ci.workspace}}/torch-${{matrix.torch-version}}/lib/python3.9/site-packages/torch/share/cmake/Torch -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j $(nproc) + cmake --build . --target install + + - name: build-ncnn + run: | + export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j $(nproc) + cd .. + export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) + pip3 install --user . + + - name: build-pnnx + run: | + export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} + cd tools/pnnx + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DTorchVision_INSTALL_DIR=${{ci.workspace}}/torchvision-${{matrix.torchvision-version}}-install .. + cmake --build . -j $(nproc) + + - name: test + run: | + export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} + export OMP_NUM_THREADS=1 + export MKL_NUM_THREADS=1 + export MKL_ENABLE_INSTRUCTIONS=SSE4_2 + cd tools/pnnx + cd build && ctest --output-on-failure -j 16 diff --git a/.ci/test-coverage.yml b/.ci/test-coverage.yml new file mode 100644 index 000000000000..9272caac1ba4 --- /dev/null +++ b/.ci/test-coverage.yml @@ -0,0 +1,808 @@ +name: test-coverage +on: + push: + branches: [master] + paths: + - '.ci/test-coverage.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/**' + - 'tests/**' + - 'toolchains/**' + mr: + target-branches: [master] + paths: + - '.ci/test-coverage.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/**' + - 'tests/**' + - 'toolchains/**' +concurrency: + group: test-coverage-${{ ci.head_ref }} + +jobs: + linux-gcc-gpu: + name: linux-gcc-gpu + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov libvulkan-dev + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-swiftshader + id: cache-swiftshader + uses: cache@1.* + with: + cachePaths: swiftshader-install + cacheKey: swiftshader-linux-install-20221026 + + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cacheHit != 'true' + checkout: https://github.com/google/swiftshader.git + with: + pullType: COMMIT_ID + refName: 04d007924c2d33ea1ac4be78ae423507a0b08b61 + localPath: swiftshader + enableSubmodule: false + enableGitLfs: false + + - name: swiftshader + if: steps.cache-swiftshader.outputs.cacheHit != 'true' + run: | + cd swiftshader + git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j $(nproc) + mkdir ${{ci.workspace}}/swiftshader-install + cp Linux/* ${{ci.workspace}}/swiftshader-install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + printf "[Processor]\nThreadCount=4\n" > build/tests/SwiftShader.ini + export VK_ICD_FILENAMES="${{ci.workspace}}/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 4 + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/glslang/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-gpu-lavapipe: + name: linux-gcc-gpu-lavapipe + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov libvulkan-dev libxcb-shm0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-lavapipe + id: cache-lavapipe + uses: cache@1.* + with: + cachePaths: lavapipe-install + cacheKey: lavapipe-linux-install-20211127-3 + + - name: checkout-lavapipe + if: steps.cache-lavapipe.outputs.cacheHit != 'true' + checkout: https://github.com/mesa3d/mesa.git + with: + pullType: COMMIT_ID + refName: cd39180cfab20734744b379b085cc3b5c2cecd3a + localPath: mesa + enableSubmodule: false + enableGitLfs: false + + - name: lavapipe + if: steps.cache-lavapipe.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y mesa + mkdir -p "${{ci.workspace}}/lavapipe-install" + cd mesa + mkdir build + cd build + meson -Dprefix="${{ci.workspace}}/lavapipe-install" -Dbuildtype=release -Db_lto=true -Db_ndebug=true -Dplatforms="x11" -Ddri3=enabled -Ddri-drivers="" -Dgallium-drivers=swrast -Dgallium-vdpau=disabled -Dgallium-xvmc=disabled -Dgallium-omx=disabled -Dgallium-va=disabled -Dgallium-xa=disabled -Dgallium-opencl=disabled -Dopencl-native=false -Dvulkan-drivers=swrast -Dshader-cache=disabled -Dgles1=disabled -Dgles2=disabled -Dopengl=false -Dgbm=disabled -Dglx=disabled -Degl=disabled -Dllvm=enabled -Dvalgrind=disabled -Dlibunwind=disabled -Dlmsensors=disabled .. + ninja -j$(nproc) + ninja install + find ${{ci.workspace}}/lavapipe-install + cat ${{ci.workspace}}/lavapipe-install/share/vulkan/icd.d/lvp_icd.x86_64.json + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export LP_NUM_THREADS=4 + export VK_ICD_FILENAMES="${{ci.workspace}}/lavapipe-install/share/vulkan/icd.d/lvp_icd.x86_64.json" + cd build + ctest --output-on-failure -j 4 + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/glslang/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-x64: + name: linux-gcc-x64 + strategy: + matrix: + # openmp: ['OFF', 'ON'] + include: + - { SSE2: 'OFF', AVX: 'OFF', XOP: 'OFF', F16C: 'OFF', FMA: 'OFF', AVX2: 'OFF', AVX512: 'OFF', AVX512VNNI: 'OFF', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + - { SSE2: 'ON', AVX: 'OFF', XOP: 'OFF', F16C: 'OFF', FMA: 'OFF', AVX2: 'OFF', AVX512: 'OFF', AVX512VNNI: 'OFF', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + - { SSE2: 'ON', AVX: 'ON', XOP: 'OFF', F16C: 'OFF', FMA: 'OFF', AVX2: 'OFF', AVX512: 'OFF', AVX512VNNI: 'OFF', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + - { SSE2: 'ON', AVX: 'ON', XOP: 'OFF', F16C: 'ON', FMA: 'ON', AVX2: 'ON', AVX512: 'OFF', AVX512VNNI: 'OFF', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + - { SSE2: 'ON', AVX: 'ON', XOP: 'OFF', F16C: 'ON', FMA: 'ON', AVX2: 'ON', AVX512: 'ON', AVX512VNNI: 'ON', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON \ + -DNCNN_SSE2=${{matrix.SSE2}} \ + -DNCNN_AVX=${{matrix.AVX}} \ + -DNCNN_XOP=${{matrix.XOP}} \ + -DNCNN_F16C=${{matrix.F16C}} \ + -DNCNN_FMA=${{matrix.FMA}} \ + -DNCNN_AVX2=${{matrix.AVX2}} \ + -DNCNN_AVX512=${{matrix.AVX512}} \ + -DNCNN_AVXVNNI=${{matrix.AVXVNNI}} \ + -DNCNN_AVX512VNNI=${{matrix.AVX512VNNI}} \ + -DNCNN_AVX512BF16=${{matrix.AVX512BF16}} \ + -DNCNN_AVX512FP16=${{matrix.AVX512FP16}} \ + .. + cmake --build . 
-j $(nproc) + - name: test + run: cd build && ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-arm: + name: linux-gcc-arm + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-arm-linux-gnueabi g++-arm-linux-gnueabihf libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-arm-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=arm-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=ON -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + + - name: build-armhf-vfpv3-d16 + run: | + mkdir build-armhf-vfpv3-d16 && cd build-armhf-vfpv3-d16 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=OFF -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test-armhf-vfpv3-d16 + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build-armhf-vfpv3-d16 + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc) + - name: lcov-collect-armhf-vfpv3-d16 + run: | + cd build-armhf-vfpv3-d16 + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build-armhf-vfpv3-d16/*' -o lcov.info + lcov --list lcov.info + + - name: codecov + run: | + ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build-armhf-vfpv3-d16/lcov.info + + linux-gcc-aarch64: + name: linux-gcc-aarch64 + strategy: + matrix: + # openmp: ['OFF', 'ON'] + include: + - { ARM82: 'OFF', ARM82DOT: 'OFF', ARM82FP16FML: 'OFF', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'} + - { ARM82: 'ON', ARM82DOT: 'OFF', ARM82FP16FML: 'OFF', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'} + - { ARM82: 'ON', ARM82DOT: 'ON', ARM82FP16FML: 'ON', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'} + - { ARM82: 'ON', ARM82DOT: 'ON', ARM82FP16FML: 'ON', ARM84BF16: 'ON', ARM84I8MM: 'ON', ARM86SVE: 'OFF'} + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-aarch64-linux-gnu libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-aarch64-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=aarch64-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake \ + -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON \ + -DNCNN_ARM82=${{matrix.ARM82}} \ + -DNCNN_ARM82DOT=${{matrix.ARM82DOT}} \ + -DNCNN_ARM82FP16FML=${{matrix.ARM82FP16FML}} \ + -DNCNN_ARM84BF16=${{matrix.ARM84BF16}} \ + -DNCNN_ARM84I8MM=${{matrix.ARM84I8MM}} \ + .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-mipsisa32r6el: + name: linux-gcc-mipsisa32r6el + strategy: + matrix: + OPENMP: ['OFF', 'ON'] + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-mipsisa32r6el-linux-gnu libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-mipsel-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=mipsel-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=OFF -DNCNN_MMI=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-mipsisa64r6el: + name: linux-gcc-mipsisa64r6el + strategy: + matrix: + OPENMP: ['OFF', 'ON'] + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-mipsisa64r6el-linux-gnuabi64 libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-mips64el-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=mips64el-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=ON -DNCNN_MMI=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-riscv64: + name: linux-gcc-riscv64 + strategy: + matrix: + OPENMP: ['OFF', 'ON'] + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-riscv64-linux-gnu libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-riscv64-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=riscv64-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-riscv64-rvv: + name: linux-gcc-riscv64-rvv + strategy: + matrix: + OPENMP: ['OFF', 'ON'] + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-riscv64-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=riscv64-linux-user --disable-system + make -j$(nproc) + make install + + - name: cache-rv64gcv + id: cache-rv64gcv + uses: cache@1.* + with: + cachePaths: rv64gcv-install + cacheKey: rv64gcv-linux-install-20221029 + + - name: checkout-riscv-gnu-toolchain + if: steps.cache-rv64gcv.outputs.cacheHit != 'true' + checkout: https://github.com/riscv/riscv-gnu-toolchain.git + with: + pullType: COMMIT_ID + refName: da01ba455ce3802ffa84fdca3a089079996dbfc3 + localPath: riscv-gnu-toolchain + enableSubmodule: false + enableGitLfs: false + + - name: riscv-gnu-toolchain + if: steps.cache-rv64gcv.outputs.cacheHit != 'true' + run: | + apt-get update + apt-get install -y autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + cd riscv-gnu-toolchain + git submodule update --init --recursive --depth 1 glibc + git submodule update --init --recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + 
git submodule update --init --recursive --depth 1 riscv-dejagnu + git submodule update --init --recursive --depth 1 riscv-gdb + rm -rf riscv-binutils + git clone -b binutils-2_39-branch https://sourceware.org/git/binutils-gdb.git riscv-binutils + rm -rf riscv-gcc + git clone -b riscv-gcc-rvv-next https://github.com/riscv-collab/riscv-gcc.git riscv-gcc + cd riscv-gcc + git checkout 8a0c1b106f01c455a8fb478cfe52d859a69020fd + cd .. + sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + ./configure --prefix=${{ci.workspace}}/rv64gcv-install --with-arch=rv64gcv_zfh + make linux -j$(nproc) + find ${{ci.workspace}}/rv64gcv-install -type f | xargs -i strip -g {} || true + + - name: build + run: | + export RISCV_ROOT_PATH=${{ci.workspace}}/rv64gcv-install + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_C_FLAGS="-O1" -DCMAKE_CXX_FLAGS="-O1" -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-vlen128 + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc) + - name: lcov-collect-vlen128 + run: | + cd build + lcov --gcov-tool ${{ci.workspace}}/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info + lcov --list lcov.info + - name: codecov-vlen128 + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + - name: test-vlen256 + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc) + - name: lcov-collect-vlen256 + run: | + cd build + lcov --gcov-tool ${{ci.workspace}}/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info + lcov --list lcov.info + - name: codecov-vlen256 + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info diff --git a/.github/workflows/linux-loongarch64-cpu-gcc.yml b/.github/workflows/linux-loongarch64-cpu-gcc.yml index b2f6bc15f656..cbeef7aa4084 100644 --- a/.github/workflows/linux-loongarch64-cpu-gcc.yml +++ b/.github/workflows/linux-loongarch64-cpu-gcc.yml @@ -4,23 +4,25 @@ on: branches: [master] paths: - '.github/workflows/linux-loongarch64-cpu-gcc.yml' + - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - - 'src/layer/loongarch64/**' + - 'src/layer/loongarch/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-loongarch64-cpu-gcc.yml' + - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - 
'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - - 'src/layer/loongarch64/**' + - 'src/layer/loongarch/**' - 'tests/**' concurrency: group: linux-loongarch64-cpu-gcc-${{ github.ref }} @@ -56,6 +58,21 @@ jobs: run: | export PATH=$GITHUB_WORKSPACE:$PATH export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/cross-tools/target/usr/lib64:$LD_LIBRARY_PATH - export QEMU_STRACE=1 cd build TESTS_EXECUTABLE_LOADER=qemu-loongarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;$GITHUB_WORKSPACE/cross-tools/target/usr" ctest --output-on-failure -j 4 + + linux-gcc-loongarch64-lsx: + runs-on: [self-hosted, linux, centos] + + steps: + - uses: actions/checkout@v2 + + - name: configure + run: | + export LOONGARCH64_ROOT_PATH=/data/action/osd/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1 + export LD_LIBRARY_PATH=/data/action/osd/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1/sysroot/usr/lib64:$LD_LIBRARY_PATH + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/loongarch64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + + - name: build + run: cmake --build build -j 4 diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml index 05e0487cf540..3bb9b3632965 100644 --- a/.github/workflows/linux-riscv64-cpu-gcc.yml +++ b/.github/workflows/linux-riscv64-cpu-gcc.yml @@ -80,6 +80,19 @@ jobs: cd build TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j 2 + linux-gcc-riscv64-c906: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + - name: configure + run: | + export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1 + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v226.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. 
+ - name: build + run: cmake --build build -j 4 + linux-gcc-riscv64-rvv: runs-on: [self-hosted, linux, centos] steps: @@ -117,7 +130,7 @@ jobs: #id: cache-riscv #uses: actions/cache@v3 #with: - #path: rv64gcv-install + #path: rv64gcv-install-next #key: rv64gcv-linux-install-20210504 #- name: install-riscv-build-deps @@ -132,31 +145,31 @@ jobs: #with: #repository: riscv/riscv-gnu-toolchain #path: riscv-gnu-toolchain - #ref: 28271f03bb538d926ad2889dc8ad1b0cb1b3b45c + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 #- name: checkout-riscv-gnu-toolchain-submodules #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib #git submodule update --init --recursive --depth 1 riscv-binutils #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-glibc #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-newlib #git submodule update --init --recursive --depth 1 riscv-gdb #- name: riscv-gnu-toolchain #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' riscv-newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install --with-arch=rv64gcv_zfh + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh #make linux #- name: riscv-strip-install #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install -type f | xargs -i strip -g {} || true + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- name: build run: cmake --build build -j 4 @@ -164,10 +177,10 @@ jobs: run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 - name: test-vlen128 run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-riscv64-cpu-gnu-clang.yml b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml new file mode 100644 index 000000000000..18ad114efa49 --- /dev/null +++ b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml @@ -0,0 +1,142 @@ +name: linux-riscv64-cpu-gnu-clang +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' +concurrency: + group: linux-riscv64-cpu-gnu-clang-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-riscv64-rvv: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + #- name: cache-qemu + #id: cache-qemu + #uses: actions/cache@v3 + #with: + #path: qemu-install + #key: qemu-riscv64-install-20220502-3 + #- name: install-qemu-build-deps + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev ninja-build + #- name: checkout-qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: qemu/qemu + #path: qemu + #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + #- name: qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #cd qemu + #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + #make -j2 + #make install + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v3 + #with: + #path: rv64gcv-install-next + #key: rv64gcv-linux-install-20210504 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool 
patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 + #- name: checkout-riscv-gnu-toolchain-submodules + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh + #make linux + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true + + # - name: install-clang + # run: | + # wget https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.1/llvm-project-15.0.1.src.tar.xz + # tar -xf llvm-project-15.0.1.src.tar.xz + # cd llvm-project-15.0.1.src + # mkdir build + # cd build + # cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ + # make -j16 + # make install + + - name: build + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next + export PATH=/data/action/osd/llvm-project-15.0.1.src/build/install/bin:$PATH + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 4 + + - name: test-vlen256 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 + + - name: test-vlen128 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/pnnx.yml b/.github/workflows/pnnx.yml deleted file mode 100644 index 976f9a706d4c..000000000000 --- a/.github/workflows/pnnx.yml +++ /dev/null @@ -1,102 +0,0 @@ -name: pnnx -on: - push: - branches: [master] - paths: - - '.github/workflows/pnnx.yml' - - 'tools/pnnx/**' - - '!tools/pnnx/README.md' - pull_request: - branches: [master] - paths: - - '.github/workflows/pnnx.yml' - - 'tools/pnnx/**' - - '!tools/pnnx/README.md' -concurrency: - group: pnnx-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - ubuntu: - runs-on: [self-hosted, linux, centos] - - strategy: - fail-fast: false - matrix: - include: - - torch-version: 1.8.1 - torchvision-version: 0.9.1 - - - torch-version: 1.9.1 - torchvision-version: 0.10.1 - - - torch-version: 1.10.0 - torchvision-version: 0.11.1 - - - torch-version: 1.11.0 - torchvision-version: 0.12.0 - - - torch-version: 1.12.0 - torchvision-version: 0.13.0 - - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - name: setup pytorch-${{ matrix.torch-version }} - run: | - export PYTHONUSERBASE=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }} - pip install --user torch==${{ matrix.torch-version }}+cpu torchvision==${{ matrix.torchvision-version }}+cpu -f https://download.pytorch.org/whl/torch_stable.html - - - name: cache-torchvision-${{ matrix.torchvision-version }} - id: cache-torchvision - uses: actions/cache@v3 - with: - path: torchvision-${{ matrix.torchvision-version }}-install - key: torchvision-${{ matrix.torchvision-version }}-linux-install-20211228 - - name: checkout-torchvision-${{ matrix.torchvision-version }} - if: steps.cache-torchvision.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: pytorch/vision - path: vision - ref: v${{ matrix.torchvision-version }} - - name: torchvision-${{ matrix.torchvision-version }} - if: steps.cache-torchvision.outputs.cache-hit != 'true' - run: | - cd vision - mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/torchvision-${{ matrix.torchvision-version }}-install -DTorch_DIR=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }}/lib/python3.8/site-packages/torch/share/cmake/Torch -DCMAKE_BUILD_TYPE=Release .. - cmake --build . -j 4 - cmake --build . --target install - - - name: build-ncnn - run: | - export PYTHONUSERBASE=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }} - pip install --user pytest setuptools wheel twine - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 4 - cd .. - pip install --user . 
- - - name: build-pnnx - run: | - export PYTHONUSERBASE=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }} - cd tools/pnnx - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DTorchVision_INSTALL_DIR=$GITHUB_WORKSPACE/torchvision-${{ matrix.torchvision-version }}-install .. - cmake --build . -j 4 - - - name: test - run: | - export PYTHONUSERBASE=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }} - export OMP_NUM_THREADS=1 - export MKL_NUM_THREADS=1 - export MKL_ENABLE_INSTRUCTIONS=SSE4_2 - pip install --upgrade requests - cd tools/pnnx - cd build && ctest --output-on-failure -j 4 diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index abbf1cccc9d9..25c75230b9e8 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -1,4 +1,5 @@ name: release-python +# on: [push, pull_request] on: push: tags: @@ -31,12 +32,26 @@ jobs: path: dist/*.tar.gz build_wheels: - name: Build wheels on ${{ matrix.os }} + name: ${{ matrix.arch }} ${{ matrix.build }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macos-11] + include: + - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-manylinux*' } + - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-musllinux*' } + - { os: ubuntu-20.04, arch: x86_64, build: 'pp*' } + - { os: ubuntu-20.04, arch: i686, build: 'cp*-manylinux*' } + - { os: ubuntu-20.04, arch: i686, build: 'cp*-musllinux*' } + - { os: ubuntu-20.04, arch: i686, build: 'pp*' } + - { os: windows-2019, arch: x86, build: 'cp*' } + - { os: windows-2019, arch: AMD64, build: 'cp*' } + - { os: windows-2019, arch: AMD64, build: 'pp*' } + - { os: windows-2019, arch: ARM64, build: 'cp*' } + - { os: macos-11, arch: x86_64, build: 'cp*' } + - { os: macos-11, arch: x86_64, build: 'pp*' } + - { os: macos-11, arch: arm64, build: 'cp*' } + - { os: macos-11, arch: universal2, build: 'cp*' } steps: - uses: actions/checkout@v3 @@ -53,10 +68,14 @@ jobs: brew uninstall --ignore-dependencies libomp - name: Build wheels - uses: pypa/cibuildwheel@v2.9.0 + uses: pypa/cibuildwheel@v2.11.2 env: - CIBW_ARCHS_MACOS: x86_64 universal2 arm64 - CIBW_ARCHS_LINUX: x86_64 i686 + CIBW_ARCHS_MACOS: ${{ matrix.arch }} + CIBW_ARCHS_LINUX: ${{ matrix.arch }} + CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} + CIBW_BUILD: ${{ matrix.build }} + CIBW_BUILD_VERBOSITY: 1 + CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2 - name: Show files run: ls -lh wheelhouse @@ -72,21 +91,21 @@ jobs: path: wheelhouse/*.whl build_wheels_qemu: - name: Build wheels ${{ matrix.arch }} ${{ matrix.build }} + name: ${{ matrix.arch }} ${{ matrix.build }} runs-on: ubuntu-20.04 strategy: fail-fast: false matrix: arch: [aarch64, ppc64le, s390x] - build: ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "cp310-*"] + build: ['cp36-*', 'cp37-*', 'cp38-*', 'cp39-*', 'cp310-*', 'cp311-*'] include: - arch: aarch64 - build: "pp37-*" + build: 'pp37-*' - arch: aarch64 - build: "pp38-*" + build: 'pp38-*' - arch: aarch64 - build: "pp39-*" + build: 'pp39-*' steps: - uses: actions/checkout@v3 @@ -103,10 +122,12 @@ jobs: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.9.0 + uses: pypa/cibuildwheel@v2.11.2 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} + CIBW_BUILD_VERBOSITY: 1 + CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2 - name: Show files run: ls -lh wheelhouse @@ -138,7 +159,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.5.1 + - uses: pypa/gh-action-pypi-publish@release/v1 
with: user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9add249d2391..d17626167906 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -22,7 +22,7 @@ jobs: steps: - name: get-version id: get_version - run: echo ::set-output name=VERSION::${GITHUB_REF/refs\/tags\//} + run: echo "VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_OUTPUT full-source: needs: [setup] @@ -237,7 +237,7 @@ jobs: path: ${{ env.PACKAGENAME }}.zip openmp-macos: - runs-on: macos-latest + runs-on: macos-11 steps: - name: cache-openmp id: cache-openmp @@ -290,7 +290,7 @@ jobs: macos: needs: [setup, openmp-macos] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos steps: @@ -358,7 +358,7 @@ jobs: macos-gpu: needs: [setup, openmp-macos] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan steps: @@ -454,7 +454,7 @@ jobs: path: ${{ env.PACKAGENAME }}.zip openmp-ios: - runs-on: macos-latest + runs-on: macos-11 steps: - name: cache-openmp id: cache-openmp @@ -511,7 +511,7 @@ jobs: ios: needs: [setup, openmp-ios] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios steps: @@ -594,7 +594,7 @@ jobs: ios-gpu: needs: [setup, openmp-ios] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan steps: @@ -693,7 +693,7 @@ jobs: path: ${{ env.PACKAGENAME }}.zip openmp-ios-bitcode: - runs-on: macos-latest + runs-on: macos-11 steps: - name: cache-openmp id: cache-openmp @@ -750,7 +750,7 @@ jobs: ios-bitcode: needs: [setup, openmp-ios-bitcode] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-bitcode steps: @@ -833,7 +833,7 @@ jobs: ios-gpu-bitcode: needs: [setup, openmp-ios-bitcode] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan-bitcode steps: @@ -1267,6 +1267,7 @@ jobs: runs-on: windows-2019 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015 + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1332,6 +1333,7 @@ jobs: runs-on: windows-2019 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015-shared + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1397,6 +1399,7 @@ jobs: runs-on: windows-2019 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017 + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1462,6 +1465,7 @@ jobs: runs-on: windows-2019 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017-shared + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1527,6 +1531,7 @@ jobs: runs-on: windows-latest env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019 + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1608,6 +1613,7 @@ jobs: runs-on: windows-latest env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019-shared + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1689,6 +1695,7 @@ jobs: runs-on: windows-latest env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022 + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1770,6 +1777,7 @@ jobs: runs-on: windows-latest env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION 
}}-windows-vs2022-shared + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index f84eeb25222d..87401acd00fa 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -25,112 +25,6 @@ permissions: contents: read jobs: - linux-gcc-gpu: - runs-on: [self-hosted, linux, cvm] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-swiftshader - id: cache-swiftshader - uses: actions/cache@v3 - with: - path: swiftshader-install - key: swiftshader-linux-install-20220211 - - name: checkout-swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: google/swiftshader - path: swiftshader - ref: 0863290dc7f6cc1649bab1858790e812b8aef02a - - name: checkout-swiftshader-submodules - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive - - name: swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. - cmake --build . -j 4 - mkdir $GITHUB_WORKSPACE/swiftshader-install - cp Linux/* $GITHUB_WORKSPACE/swiftshader-install - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 4 - - name: test - run: | - printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini - export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" - cd build && ctest --output-on-failure -j 4 - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/glslang/*' -o lcov.info - lcov --list lcov.info - - name: codecov - id: codecov - continue-on-error: true - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-1 - continue-on-error: true - id: codecov-vlen256-retry-1 - if: steps.codecov.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-2 - continue-on-error: true - id: codecov-vlen256-retry-2 - if: steps.codecov-vlen256-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-3 - continue-on-error: true - id: codecov-vlen256-retry-3 - if: steps.codecov-vlen256-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-4 - continue-on-error: true - id: codecov-vlen256-retry-4 - if: steps.codecov-vlen256-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-5 - continue-on-error: true - id: codecov-vlen256-retry-5 - if: steps.codecov-vlen256-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set the status - if: always() - run: | - if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi - linux-gcc-gpu-t4: runs-on: [self-hosted, linux, t4] steps: @@ -215,1144 +109,39 @@ jobs: exit 1 fi - linux-gcc-gpu-lavapipe: - runs-on: [self-hosted, linux, cvm] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - name: cache-lavapipe - id: cache-lavapipe - uses: actions/cache@v3 - with: - path: lavapipe-install - key: lavapipe-linux-install-20211127-2 - - name: checkout-lavapipe - if: steps.cache-lavapipe.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: mesa3d/mesa - path: mesa - ref: cd39180cfab20734744b379b085cc3b5c2cecd3a - - name: lavapipe - if: steps.cache-lavapipe.outputs.cache-hit != 'true' - run: | - mkdir -p "$GITHUB_WORKSPACE/lavapipe-install" - cd mesa - mkdir build - cd build - meson -Dprefix="$GITHUB_WORKSPACE/lavapipe-install" -Dbuildtype=release -Db_lto=true -Db_ndebug=true -Dplatforms="x11" -Ddri3=enabled -Ddri-drivers="" -Dgallium-drivers=swrast -Dgallium-vdpau=disabled -Dgallium-xvmc=disabled -Dgallium-omx=disabled -Dgallium-va=disabled -Dgallium-xa=disabled -Dgallium-opencl=disabled -Dopencl-native=false -Dvulkan-drivers=swrast -Dshader-cache=disabled -Dgles1=disabled -Dgles2=disabled -Dopengl=false -Dgbm=disabled -Dglx=disabled -Degl=disabled -Dllvm=enabled 
-Dvalgrind=disabled -Dlibunwind=disabled -Dlmsensors=disabled .. - ninja -j4 - ninja install - sed -ie "s@$GITHUB_WORKSPACE/lavapipe-install/lib/x86_64-linux-gnu/libvulkan_lvp.so@../../../lib/x86_64-linux-gnu/libvulkan_lvp.so@g" $GITHUB_WORKSPACE/lavapipe-install/share/vulkan/icd.d/lvp_icd.x86_64.json - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 4 - - name: test - run: | - export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/lavapipe-install/share/vulkan/icd.d/lvp_icd.x86_64.json" - cd build && ctest --output-on-failure -j 4 - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/glslang/*' -o lcov.info - lcov --list lcov.info - - name: codecov - id: codecov - continue-on-error: true - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-1 - continue-on-error: true - id: codecov-vlen256-retry-1 - if: steps.codecov.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-2 - continue-on-error: true - id: codecov-vlen256-retry-2 - if: steps.codecov-vlen256-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-3 - continue-on-error: true - id: codecov-vlen256-retry-3 - if: steps.codecov-vlen256-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-4 - continue-on-error: true - id: codecov-vlen256-retry-4 - if: steps.codecov-vlen256-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-5 - continue-on-error: true - id: codecov-vlen256-retry-5 - if: steps.codecov-vlen256-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set the status - if: always() - run: | - if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi - - linux-gcc-x64: - runs-on: ubuntu-latest + linux-gcc-x64-avx512-spr: + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 + - name: update + run: sudo apt-get update + - name: gcc12 + run: sudo apt-get install gcc-12 g++-12 - name: lcov run: sudo apt-get install lcov - - name: build-sse2 - run: | - mkdir build-sse2 && cd build-sse2 - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test-sse2 - run: cd build-sse2 && ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-sse2 - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-sse2/*' -o lcov.info - lcov --list lcov.info - - name: codecov-sse2 - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-sse2/lcov.info - - - name: build-avx - run: | - mkdir build-avx && cd build-avx - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-avx - run: cd build-avx && ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-avx - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-avx/*' -o lcov.info - lcov --list lcov.info - - name: codecov-avx - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-avx/lcov.info - - - name: build-avx2 + - name: Setup SDE binaries + uses: petarpetrovt/setup-sde@v2 + - name: build-avx512-spr + env: + CC: gcc-12 + CXX: g++-12 run: | - mkdir build-avx2 && cd build-avx2 - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + mkdir build-avx512-spr && cd build-avx512-spr + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j 2 - - name: test-avx2 - run: cd build-avx2 && ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-avx2 - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-avx2/*' -o lcov.info - lcov --list lcov.info - - name: codecov-avx2 - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-avx2/lcov.info - - linux-gcc-x64-avx512: - runs-on: [self-hosted, linux, t4] - steps: - - uses: actions/checkout@v3 - - name: build - env: - CC: gcc - CXX: g++ - LD_LIBRARY_PATH: /data/action/install/lib64 + - name: test-avx512-spr run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 4 - - name: test - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: cd build && ctest --output-on-failure -j 4 + cd build-avx512-spr + TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2 - name: lcov-collect run: | - cd build - lcov -d ./src -c -o lcov.info + cd build-avx512-spr + lcov --gcov-tool gcov-12 -d ./src -c -o lcov.info lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/install/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/build-avx512-spr/*' -o lcov.info lcov --list lcov.info - - name: codecov - id: codecov - continue-on-error: true - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-1 - continue-on-error: true - id: codecov-vlen256-retry-1 - if: steps.codecov.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-2 - continue-on-error: true - id: codecov-vlen256-retry-2 - if: steps.codecov-vlen256-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-3 - continue-on-error: true - id: codecov-vlen256-retry-3 - if: steps.codecov-vlen256-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-4 - continue-on-error: true - id: codecov-vlen256-retry-4 - if: steps.codecov-vlen256-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-5 - continue-on-error: true - id: codecov-vlen256-retry-5 - if: steps.codecov-vlen256-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set the status - if: always() - run: | - if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi - - linux-gcc-x64-avx512-spr: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - name: update - run: sudo apt-get update - - name: gcc12 - run: sudo apt-get install gcc-12 g++-12 - - name: lcov - run: sudo apt-get install lcov - - name: Setup SDE binaries - uses: petarpetrovt/setup-sde@v2 - - name: build-avx512-spr - env: - CC: gcc-12 - CXX: g++-12 - run: | - mkdir build-avx512-spr && cd build-avx512-spr - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test-avx512-spr - run: | - cd build-avx512-spr - TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-avx512-spr - lcov --gcov-tool gcov-12 -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-avx512-spr/*' -o lcov.info - lcov --list lcov.info - - name: codecov-avx512-spr + - name: codecov-avx512-spr uses: codecov/codecov-action@v3 with: token: ${{ secrets.CODECOV_TOKEN }} file: build-avx512-spr/lcov.info - - linux-gcc-armhf-vfpv3-d16: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: lcov - run: sudo apt-get install lcov - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-arm-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system - make -j2 - make install - - - name: arm-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-arm-linux-gnueabihf - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=OFF -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-arm: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: lcov - run: sudo apt-get install lcov - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-arm-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system - make -j2 - make install - - - name: arm-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-arm-linux-gnueabi - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=ON -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-aarch64: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: lcov - run: sudo apt-get install lcov - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-arm82: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build-arm82 - run: | - mkdir build-arm82 && cd build-arm82 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_ARM82=ON -DNCNN_ARM82FP16FML=OFF -DNCNN_ARM84BF16=OFF -DNCNN_ARM84I8MM=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-arm82 - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-arm82 - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-arm82/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-arm82/lcov.info - - linux-gcc-arm82-omp: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build-arm82-omp - run: | - mkdir build-arm82-omp && cd build-arm82-omp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_ARM82=ON -DNCNN_ARM82FP16FML=OFF -DNCNN_ARM84BF16=OFF -DNCNN_ARM84I8MM=OFF -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-arm82-omp - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-arm82-omp - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-arm82-omp/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-arm82-omp/lcov.info - - linux-gcc-arm82dot-omp: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build-arm82dot-omp - run: | - mkdir build-arm82dot-omp && cd build-arm82dot-omp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=ON -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_ARM82FP16FML=OFF -DNCNN_ARM84BF16=OFF -DNCNN_ARM84I8MM=OFF -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-arm82dot-omp - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-arm82dot-omp - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-arm82dot-omp/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-arm82dot-omp/lcov.info - - linux-gcc-arm84: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build-arm84 - run: | - mkdir build-arm84 && cd build-arm84 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=ON -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_ARM82FP16FML=ON -DNCNN_ARM84BF16=ON -DNCNN_ARM84I8MM=ON -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-arm84 - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-arm84 - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-arm84/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-arm84/lcov.info - - linux-gcc-mipsisa32r6el: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-mipsel-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system - make -j2 - make install - - - name: mipsisa32r6el-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-mipsisa32r6el-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=OFF -DNCNN_MMI=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-mipsisa64r6el: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-mips64el-install-20220502-3 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0001-target-mips-Fix-SAT_S-trans-helper.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0002-target-mips-Fix-df_extract_val-and-df_extract_df-dfe.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0003-target-mips-Fix-msa-checking-condition-in-trans_msa_.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0004-target-mips-Do-not-treat-msa-INSERT-as-NOP-when-wd-i.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch - patch -p1 -i 0001-target-mips-Fix-SAT_S-trans-helper.patch - patch -p1 -i 0002-target-mips-Fix-df_extract_val-and-df_extract_df-dfe.patch - patch -p1 -i 0003-target-mips-Fix-msa-checking-condition-in-trans_msa_.patch - patch -p1 -i 0004-target-mips-Do-not-treat-msa-INSERT-as-NOP-when-wd-i.patch - patch -p1 -i 0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch - patch -p1 -i 0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system - make -j2 - make install - - - name: mipsisa64r6el-gnuabi64-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-mipsisa64r6el-linux-gnuabi64 - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=ON -DNCNN_MMI=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-riscv64: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-riscv64-install-20220502-3 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - make -j2 - make install - - - name: riscv64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-riscv64-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-riscv64-rvv: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v3 - - #- name: lcov - #run: sudo apt-get install lcov - - #- name: cache-qemu - #id: cache-qemu - #uses: actions/cache@v3 - #with: - #path: qemu-install - #key: qemu-riscv64-install-20220502-3 - #- name: install-qemu-build-deps - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev ninja-build - #- name: checkout-qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: qemu/qemu - #path: qemu - #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - #- name: qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #cd qemu - #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - #make -j2 - #make install - - #- name: cache-riscv - #id: cache-riscv - #uses: actions/cache@v3 - #with: - #path: rv64gcv-install - #key: rv64gcv-linux-install-20210504 - - #- name: install-riscv-build-deps - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler - - #- name: checkout-riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-gnu-toolchain - #path: riscv-gnu-toolchain - #ref: 28271f03bb538d926ad2889dc8ad1b0cb1b3b45c - #- name: checkout-riscv-gnu-toolchain-submodules - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #git submodule update --init --recursive --depth 1 riscv-binutils - #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-glibc - #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-newlib - #git submodule update --init --recursive --depth 1 riscv-gdb - #- name: riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' riscv-newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install --with-arch=rv64gcv_zfh - #make linux - - #- name: riscv-strip-install - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install -type f | xargs -i strip -g {} || true - - - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install && mkdir build && cd 
build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_C_FLAGS="-O1" -DCMAKE_CXX_FLAGS="-O1" -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 4 - - - name: test-vlen128 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 - - - name: lcov-collect-vlen128 - run: | - cd build - lcov --gcov-tool /data/action/osd/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info - lcov --list lcov.info - - name: codecov-vlen128 - id: codecov-vlen128 - continue-on-error: true - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-1 - continue-on-error: true - id: codecov-vlen128-retry-1 - if: steps.codecov-vlen128.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-2 - continue-on-error: true - id: codecov-vlen128-retry-2 - if: steps.codecov-vlen128-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-3 - continue-on-error: true - id: codecov-vlen128-retry-3 - if: steps.codecov-vlen128-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-4 - continue-on-error: true - id: codecov-vlen128-retry-4 - if: steps.codecov-vlen128-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-5 - continue-on-error: true - id: codecov-vlen128-retry-5 - if: steps.codecov-vlen128-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set codecov-vlen128 status - if: always() - run: | - if ${{ steps.codecov-vlen128.outcome=='success' || steps.codecov-vlen128-retry-1.outcome=='success' || steps.codecov-vlen128-retry-2.outcome=='success' || steps.codecov-vlen128-retry-3.outcome=='success' || steps.codecov-vlen128-retry-4.outcome=='success' || steps.codecov-vlen128-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi - - - name: test-vlen256 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 - - - name: lcov-collect-vlen256 - run: | - cd build - lcov --gcov-tool /data/action/osd/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info - lcov --list lcov.info - - name: codecov-vlen256 - id: codecov-vlen256 - continue-on-error: true - uses: 
codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-1 - continue-on-error: true - id: codecov-vlen256-retry-1 - if: steps.codecov-vlen256.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-2 - continue-on-error: true - id: codecov-vlen256-retry-2 - if: steps.codecov-vlen256-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-3 - continue-on-error: true - id: codecov-vlen256-retry-3 - if: steps.codecov-vlen256-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-4 - continue-on-error: true - id: codecov-vlen256-retry-4 - if: steps.codecov-vlen256-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-5 - continue-on-error: true - id: codecov-vlen256-retry-5 - if: steps.codecov-vlen256-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set codecov-vlen256 status - if: always() - run: | - if ${{ steps.codecov-vlen256.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi diff --git a/.github/workflows/windows-arm-cpu.yml b/.github/workflows/windows-arm-cpu.yml index 2b1441ed74b0..d789482a5951 100644 --- a/.github/workflows/windows-arm-cpu.yml +++ b/.github/workflows/windows-arm-cpu.yml @@ -41,6 +41,8 @@ jobs: toolset-version: v143 os: windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 - name: build diff --git a/.github/workflows/windows-arm64-cpu.yml b/.github/workflows/windows-arm64-cpu.yml index 49cd6e200ffe..a6bdbda01dee 100644 --- a/.github/workflows/windows-arm64-cpu.yml +++ b/.github/workflows/windows-arm64-cpu.yml @@ -41,6 +41,8 @@ jobs: toolset-version: v143 os: windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 - name: build diff --git a/.github/workflows/windows-x64-cpu-vs2019-python.yml b/.github/workflows/windows-x64-cpu-vs2019-python.yml index 7ef6a1adcec0..3d4e6583766b 100644 --- a/.github/workflows/windows-x64-cpu-vs2019-python.yml +++ b/.github/workflows/windows-x64-cpu-vs2019-python.yml @@ -32,6 +32,8 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8, 3.9] + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/windows-x64-cpu.yml b/.github/workflows/windows-x64-cpu.yml index c93fc8adfe12..200185d1a56a 100644 --- a/.github/workflows/windows-x64-cpu.yml +++ b/.github/workflows/windows-x64-cpu.yml @@ -55,6 +55,8 @@ jobs: toolset-version: v143 os: windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 - name: cache-protobuf diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows-x64-gpu.yml index 4101e43ce25c..3d707c17052a 100644 --- a/.github/workflows/windows-x64-gpu.yml +++ b/.github/workflows/windows-x64-gpu.yml @@ -57,6 +57,8 @@ jobs: toolset-version: v143 os: 
windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/windows-x86-cpu.yml b/.github/workflows/windows-x86-cpu.yml index 8e692e0d695a..b48431a97ac1 100644 --- a/.github/workflows/windows-x86-cpu.yml +++ b/.github/workflows/windows-x86-cpu.yml @@ -49,6 +49,8 @@ jobs: toolset-version: v143 os: windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 - name: build diff --git a/.gitignore b/.gitignore index de1330fdeb7f..aa8ea4ddcb2e 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,7 @@ build*/ .idea cmake-build-debug cmake-build-release +CMakeSettings.json # Compiled python __pycache__ @@ -56,3 +57,6 @@ python/setup.py # Clangd .cache/ + +# Xmake +.xmake/ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index c453d23e5440..8d0b4c63a1fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,7 +96,7 @@ else() option(NCNN_BUILD_EXAMPLES "build examples" ON) endif() -if(ANDROID OR IOS OR LINUX OR NCNN_SIMPLESTL) +if(ANDROID OR IOS OR NCNN_SIMPLESTL) option(NCNN_DISABLE_EXCEPTION "disable exception" ON) else() option(NCNN_DISABLE_EXCEPTION "disable exception" OFF) @@ -147,6 +147,7 @@ endif() if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)") + OR (CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "ARM64") OR ((CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) AND (${CMAKE_GENERATOR_PLATFORM} MATCHES "^(arm|arm64)"))) set(NCNN_TARGET_ARCH arm) @@ -171,7 +172,7 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML) set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16") - check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vbfmmlaq_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16) + check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16) set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm") check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM) @@ -290,16 +291,34 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)") else() message(WARNING "The compiler does not support loongson mmi extension. NCNN_MMI will be OFF.") endif() +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)") + set(NCNN_TARGET_ARCH loongarch) + + include(CheckCXXCompilerFlag) + + check_cxx_compiler_flag("-mlsx" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) + + if(NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) + option(NCNN_LSX "optimize loongarch platform with lsx extension" ON) + else() + message(WARNING "The compiler does not support lsx extension.
NCNN_LSX will be OFF.") + endif() + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") set(NCNN_TARGET_ARCH riscv) include(CheckCXXCompilerFlag) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") - check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; float _v; word_type vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV) + check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; float _v; size_t vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh") - check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; word_type vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_FP16) + check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZFH) + + if(NOT NCNN_COMPILER_SUPPORT_RVV_ZFH) + set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") + check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZVFH) + endif() unset(CMAKE_REQUIRED_FLAGS) @@ -309,16 +328,24 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") if(NCNN_RVV_CHECK_VFREDSUM) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/ncnn_check_rvv_vfredusum.cmake) endif() - if(NOT NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NOT (NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH)) message(WARNING "The compiler does not support risc-v zfh extension. Upgrading your toolchain is strongly recommended.") endif() + option(NCNN_RVV_CHECK_PLAIN_SEGMENT "check compiler about rvv segment load/store interface" ON) + if(NCNN_RVV_CHECK_PLAIN_SEGMENT) + set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") + check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; size_t vl; float src[32]={.0f}; vlseg2e32_v_f32m1(&_s, &_w, src, vl); return 0; }" NCNN_COMPILER_USE_RVV_PLAIN_SEG) + unset(CMAKE_REQUIRED_FLAGS) + endif() + if(NOT NCNN_COMPILER_USE_RVV_PLAIN_SEG) + message(WARNING "The compiler uses tuple types for segment load/store. Upgrading your toolchain is strongly recommended.") + add_definitions(-D__rvv_tuple) + endif() else() message(WARNING "The compiler does not support risc-v v extension. NCNN_RVV will be OFF.") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") set(NCNN_TARGET_ARCH powerpc) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch)") - set(NCNN_TARGET_ARCH mips) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)") set(NCNN_TARGET_ARCH xtensa) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x)") @@ -469,29 +496,38 @@ endif() if(NCNN_VULKAN) if(NCNN_SYSTEM_GLSLANG) - set(GLSLANG_TARGET_DIR "GLSLANG-NOTFOUND" CACHE PATH "Absolute path to glslangTargets.cmake directory") - if(NOT GLSLANG_TARGET_DIR AND NOT DEFINED ENV{GLSLANG_TARGET_DIR}) - message(WARNING "GLSLANG_TARGET_DIR must be defined!
NCNN_SYSTEM_GLSLANG will be turned off.") - set(NCNN_SYSTEM_GLSLANG OFF) + find_package(Threads) + find_package(glslang QUIET) + if(glslang_FOUND) + add_library(glslang ALIAS glslang::glslang) + add_library(SPIRV ALIAS glslang::SPIRV) else() - message(STATUS "Using glslang install located at ${GLSLANG_TARGET_DIR}") - - find_package(Threads) - - include("${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake") - include("${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake") - if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") - # hlsl support can be optional - include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") - endif() - include("${GLSLANG_TARGET_DIR}/glslangTargets.cmake") - include("${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake") - - if (NOT TARGET glslang OR NOT TARGET SPIRV) - message(WARNING "glslang or SPIRV target not found! NCNN_SYSTEM_GLSLANG will be turned off.") + set(GLSLANG_TARGET_DIR "GLSLANG-NOTFOUND" CACHE PATH "Absolute path to glslangTargets.cmake directory") + if(NOT GLSLANG_TARGET_DIR AND NOT DEFINED ENV{GLSLANG_TARGET_DIR}) + message(WARNING "set glslang_DIR to glslang-config.cmake directory for using system glslang.") + message(WARNING "GLSLANG_TARGET_DIR must be defined! NCNN_SYSTEM_GLSLANG will be turned off.") set(NCNN_SYSTEM_GLSLANG OFF) + else() + include("${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake") + include("${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake") + if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + # hlsl support can be optional + include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + endif() + include("${GLSLANG_TARGET_DIR}/glslangTargets.cmake") + include("${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake") endif() endif() + + if (TARGET glslang AND TARGET SPIRV) + get_property(glslang_location TARGET glslang PROPERTY LOCATION) + get_property(SPIRV_location TARGET SPIRV PROPERTY LOCATION) + message(STATUS "Found glslang: ${glslang_location} (found version \"${glslang_VERSION}\")") + message(STATUS "Found SPIRV: ${SPIRV_location} (found version \"${glslang_VERSION}\")") + else() + message(WARNING "glslang or SPIRV target not found! NCNN_SYSTEM_GLSLANG will be turned off.") + set(NCNN_SYSTEM_GLSLANG OFF) + endif() endif() if(NOT NCNN_SYSTEM_GLSLANG) diff --git a/README.md b/README.md index c25a985aad4e..6c71520e0f24 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天 ## Current building status matrix -| System | CPU (32bit] | CPU (64bit) | GPU (32bit) | GPU (64bit) | +| System | CPU (32bit) | CPU (64bit) | GPU (32bit) | GPU (64bit) | | :---------------- | :------------------------------------------------------------------ | :------------------------------------------------------------------------------ | :-------------------------------------------------------------- | :------------------------------------------------------------------ | | Linux (GCC) | [![Build Status][pass-linux-x86-cpu-gcc]][ci-linux-x86-cpu-gcc] | [![Build Status][pass-linux-x64-cpu-gcc]][ci-linux-x64-cpu-gcc] | — | [![Build Status][pass-linux-x64-gpu-gcc]][ci-linux-x64-gpu-gcc] | | Linux (Clang) | [![Build Status][pass-linux-x86-cpu-clang]][ci-linux-x86-cpu-clang] | [![Build Status][pass-linux-x64-cpu-clang]][ci-linux-x64-cpu-clang] | — | [![Build Status][pass-linux-x64-gpu-clang]][ci-linux-x64-gpu-clang] | @@ -298,7 +298,7 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天 --- -## Example project +## Project examples - - @@ -307,10 +307,12 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天 - - - 🤩 +- - - -功能概述 +
+ +-
+ --- diff --git a/benchmark/README.md b/benchmark/README.md index dca02a2c82a6..004283d7682e 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -4236,7 +4236,7 @@ cooling_down = 0 yolo-fastestv2 min = 91.08 max = 102.93 avg = 94.41 ``` -### ### AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) +### AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) test in wsl2 with ubuntu 20.04 ``` $ ./benchncnn 10 1 0 -1 0 @@ -4370,3 +4370,168 @@ cooling_down = 1 vision_transformer min = 22201.32 max = 22510.75 avg = 22315.09 FastestDet min = 146.94 max = 148.50 avg = 147.44 ``` + +### T-Head TH1520 (C910V, 1.848 GHz x 4) + +Tested on `Linux anolis-riscv 5.10.112-00579-g8e3db308d5a5 #23 SMP PREEMPT Fri Aug 12 10:17:32 CST 2022 riscv64 riscv64 riscv64 GNU/Linux` + +``` +[root@anolis-riscv benchmark]# ./benchncnn +syscall error -1 +loop_count = 4 +num_threads = 4 +powersave = 0 +gpu_device = -1 +cooling_down = 1 + squeezenet min = 187.88 max = 188.82 avg = 188.13 + squeezenet_int8 min = 2388.26 max = 2446.92 avg = 2411.46 + mobilenet min = 321.46 max = 323.34 avg = 322.19 + mobilenet_int8 min = 2318.93 max = 2458.55 avg = 2400.99 + mobilenet_v2 min = 214.01 max = 216.00 avg = 215.35 + mobilenet_v3 min = 247.71 max = 248.18 avg = 247.96 + shufflenet min = 155.58 max = 155.85 avg = 155.67 + shufflenet_v2 min = 99.50 max = 99.75 avg = 99.63 + mnasnet min = 261.46 max = 263.83 avg = 262.53 + proxylessnasnet min = 315.40 max = 316.89 avg = 316.28 + efficientnet_b0 min = 484.97 max = 486.16 avg = 485.55 + efficientnetv2_b0 min = 453.03 max = 453.40 avg = 453.21 + regnety_400m min = 314.09 max = 315.33 avg = 314.77 + blazeface min = 46.14 max = 46.69 avg = 46.39 + googlenet min = 650.99 max = 653.60 avg = 651.69 + googlenet_int8 min = 5435.11 max = 6391.98 avg = 6012.81 + resnet18 min = 505.48 max = 506.70 avg = 506.06 + resnet18_int8 min = 5053.33 max = 6599.94 avg = 6001.86 + alexnet min = 403.68 max = 404.60 avg = 404.23 + vgg16 min = 2731.55 max = 2746.48 avg = 2738.82 +``` + +### Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) +test in ROCK5 MODEL B + +``` +rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 1 0 -1 0 +loop_count = 10 +num_threads = 1 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 15.22 max = 16.03 avg = 15.70 + squeezenet_int8 min = 16.77 max = 16.96 avg = 16.86 + mobilenet min = 23.07 max = 23.58 avg = 23.36 + mobilenet_int8 min = 18.58 max = 18.90 avg = 18.72 + mobilenet_v2 min = 18.74 max = 19.10 avg = 18.96 + mobilenet_v3 min = 14.40 max = 14.65 avg = 14.50 + shufflenet min = 9.74 max = 9.88 avg = 9.84 + shufflenet_v2 min = 9.44 max = 9.55 avg = 9.50 + mnasnet min = 14.73 max = 15.03 avg = 14.87 + proxylessnasnet min = 18.37 max = 18.59 avg = 18.46 + efficientnet_b0 min = 29.11 max = 30.18 avg = 29.63 + efficientnetv2_b0 min = 46.40 max = 46.95 avg = 46.76 + regnety_400m min = 19.18 max = 19.39 avg = 19.28 + blazeface min = 5.16 max = 5.23 avg = 5.20 + googlenet min = 64.64 max = 65.33 avg = 65.00 + googlenet_int8 min = 61.86 max = 63.41 avg = 62.42 + resnet18 min = 42.00 max = 43.34 avg = 42.48 + resnet18_int8 min = 67.22 max = 67.80 avg = 67.45 + alexnet min = 57.65 max = 58.21 avg = 58.01 + vgg16 min = 192.35 max = 193.36 avg = 192.84 + vgg16_int8 min = 570.86 max = 578.81 avg = 574.50 + resnet50 min = 107.86 max = 109.52 avg = 108.70 + resnet50_int8 min = 134.41 max = 135.86 avg = 135.18 + squeezenet_ssd min = 40.85 max = 41.24 avg = 41.02 + squeezenet_ssd_int8 min = 52.23 max = 53.70 avg = 52.54 + mobilenet_ssd min = 45.11 max = 45.50 avg = 45.32 
+ mobilenet_ssd_int8 min = 36.53 max = 36.63 avg = 36.59 + mobilenet_yolo min = 95.18 max = 96.79 avg = 95.90 + mobilenetv2_yolov3 min = 65.50 max = 65.88 avg = 65.72 + yolov4-tiny min = 86.13 max = 88.84 avg = 87.29 + nanodet_m min = 22.57 max = 22.87 avg = 22.74 + yolo-fastest-1.1 min = 9.23 max = 9.35 avg = 9.29 + yolo-fastestv2 min = 8.62 max = 8.83 avg = 8.73 + vision_transformer min = 3077.54 max = 3396.13 avg = 3339.58 + FastestDet min = 9.11 max = 9.30 avg = 9.20 + +rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 8 0 -1 0 +loop_count = 10 +num_threads = 8 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 10.02 max = 11.01 avg = 10.43 + squeezenet_int8 min = 11.78 max = 13.77 avg = 12.55 + mobilenet min = 12.75 max = 13.58 avg = 13.12 + mobilenet_int8 min = 12.23 max = 14.29 avg = 13.54 + mobilenet_v2 min = 12.76 max = 14.27 avg = 13.40 + mobilenet_v3 min = 9.51 max = 9.81 avg = 9.71 + shufflenet min = 7.06 max = 7.23 avg = 7.13 + shufflenet_v2 min = 6.21 max = 7.32 avg = 6.38 + mnasnet min = 9.32 max = 12.49 avg = 10.75 + proxylessnasnet min = 13.79 max = 15.51 avg = 14.70 + efficientnet_b0 min = 16.59 max = 17.99 avg = 17.08 + efficientnetv2_b0 min = 28.26 max = 32.26 avg = 30.52 + regnety_400m min = 13.43 max = 15.00 avg = 13.72 + blazeface min = 3.87 max = 7.38 avg = 5.65 + googlenet min = 29.18 max = 44.00 avg = 36.31 + googlenet_int8 min = 31.14 max = 37.48 avg = 34.58 + resnet18 min = 21.47 max = 24.40 avg = 22.35 + resnet18_int8 min = 26.68 max = 29.89 avg = 28.45 + alexnet min = 29.35 max = 38.09 avg = 31.65 + vgg16 min = 112.37 max = 122.94 avg = 117.05 + vgg16_int8 min = 161.08 max = 215.29 avg = 176.89 + resnet50 min = 54.54 max = 57.50 avg = 55.71 + resnet50_int8 min = 54.76 max = 65.05 avg = 60.59 + squeezenet_ssd min = 26.21 max = 35.05 avg = 30.76 + squeezenet_ssd_int8 min = 33.34 max = 40.88 avg = 36.19 + mobilenet_ssd min = 26.71 max = 28.85 avg = 27.88 + mobilenet_ssd_int8 min = 22.03 max = 25.31 avg = 24.21 + mobilenet_yolo min = 60.51 max = 74.65 avg = 65.45 + mobilenetv2_yolov3 min = 37.27 max = 44.13 avg = 41.20 + yolov4-tiny min = 49.84 max = 58.12 avg = 53.93 + nanodet_m min = 16.54 max = 22.41 avg = 20.60 + yolo-fastest-1.1 min = 8.49 max = 13.50 avg = 9.91 + yolo-fastestv2 min = 6.28 max = 11.22 avg = 8.00 + vision_transformer min = 968.62 max = 1063.47 avg = 1019.12 + FastestDet min = 6.14 max = 11.92 avg = 7.85 + +rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 4 2 -1 0 +loop_count = 10 +num_threads = 4 +powersave = 2 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 6.78 max = 7.27 avg = 7.07 + squeezenet_int8 min = 4.58 max = 4.73 avg = 4.63 + mobilenet min = 5.67 max = 5.78 avg = 5.72 + mobilenet_int8 min = 5.01 max = 5.20 avg = 5.15 + mobilenet_v2 min = 5.44 max = 5.76 avg = 5.50 + mobilenet_v3 min = 4.67 max = 5.03 avg = 4.74 + shufflenet min = 4.22 max = 4.30 avg = 4.27 + shufflenet_v2 min = 3.48 max = 3.60 avg = 3.53 + mnasnet min = 4.52 max = 4.83 avg = 4.61 + proxylessnasnet min = 5.44 max = 6.01 avg = 5.56 + efficientnet_b0 min = 8.33 max = 8.52 avg = 8.41 + efficientnetv2_b0 min = 12.95 max = 13.08 avg = 13.02 + regnety_400m min = 8.60 max = 8.73 avg = 8.66 + blazeface min = 1.86 max = 1.95 avg = 1.90 + googlenet min = 16.58 max = 16.85 avg = 16.65 + googlenet_int8 min = 16.99 max = 17.13 avg = 17.06 + resnet18 min = 14.98 max = 15.30 avg = 15.08 + resnet18_int8 min = 20.10 max = 20.22 avg = 20.15 + alexnet min = 19.78 max = 20.21 avg = 19.87 + vgg16 min = 66.35 max = 94.16 avg = 75.24 + vgg16_int8 min = 131.02 
max = 131.98 avg = 131.51 + resnet50 min = 28.07 max = 28.78 avg = 28.28 + resnet50_int8 min = 33.56 max = 35.53 avg = 33.84 + squeezenet_ssd min = 16.40 max = 16.80 avg = 16.49 + squeezenet_ssd_int8 min = 18.64 max = 19.00 avg = 18.76 + mobilenet_ssd min = 13.66 max = 13.78 avg = 13.72 + mobilenet_ssd_int8 min = 11.23 max = 11.42 avg = 11.33 + mobilenet_yolo min = 30.76 max = 31.03 avg = 30.86 + mobilenetv2_yolov3 min = 19.28 max = 21.07 avg = 20.30 + yolov4-tiny min = 33.44 max = 37.68 avg = 34.70 + nanodet_m min = 8.28 max = 8.55 avg = 8.38 + yolo-fastest-1.1 min = 4.30 max = 4.40 avg = 4.34 + yolo-fastestv2 min = 4.07 max = 4.18 avg = 4.13 + vision_transformer min = 815.67 max = 819.27 avg = 817.49 + FastestDet min = 4.34 max = 7.47 avg = 5.18 +``` \ No newline at end of file diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 032e3f9fbc42..714dca3180f6 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -161,8 +161,8 @@ void benchmark(const char* comment, const ncnn::Mat& _in, const ncnn::Option& op int main(int argc, char** argv) { int loop_count = 4; - int num_threads = ncnn::get_cpu_count(); - int powersave = 0; + int num_threads = ncnn::get_physical_big_cpu_count(); + int powersave = 2; int gpu_device = -1; int cooling_down = 1; @@ -199,8 +199,8 @@ int main(int argc, char** argv) g_loop_count = loop_count; - g_blob_pool_allocator.set_size_compare_ratio(0.0f); - g_workspace_pool_allocator.set_size_compare_ratio(0.5f); + g_blob_pool_allocator.set_size_compare_ratio(0.f); + g_workspace_pool_allocator.set_size_compare_ratio(0.f); #if NCNN_VULKAN if (use_vulkan_compute) diff --git a/cmake/ncnnConfig.cmake.in b/cmake/ncnnConfig.cmake.in index b118713571e9..528c69da0ade 100644 --- a/cmake/ncnnConfig.cmake.in +++ b/cmake/ncnnConfig.cmake.in @@ -19,20 +19,24 @@ if(NCNN_VULKAN) if(NOT NCNN_SHARED_LIB) if(NCNN_SYSTEM_GLSLANG) - set(GLSLANG_TARGET_DIR "@GLSLANG_TARGET_DIR@") + find_package(glslang QUIET) + if(NOT glslang_FOUND) + set(GLSLANG_TARGET_DIR "@GLSLANG_TARGET_DIR@") + include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake) + include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake) + if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + # hlsl support can be optional + include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + endif() + include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake) + include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake) + endif() else() - set(GLSLANG_TARGET_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_LIBDIR@/cmake") - endif(NCNN_SYSTEM_GLSLANG) - - include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake) - include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake) - if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") - # hlsl support can be optional - include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_LIBDIR@/cmake/glslang") + find_package(glslang QUIET) endif() - include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake) - include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake) + endif() -endif(NCNN_VULKAN) +endif() include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake) diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index 89d61823deb2..857d3b528bac 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -270,9 +270,17 @@ macro(ncnn_add_layer class) endif() endif() + if(NCNN_RUNTIME_CPU AND NCNN_TARGET_ARCH STREQUAL "loongarch") + if(NCNN_LSX) + ncnn_add_arch_opt_layer(${class} lsx "-mlsx") + endif() + endif() + if(NCNN_RUNTIME_CPU AND 
NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv") - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh") + elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) + ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") elseif(NCNN_COMPILER_SUPPORT_RVV) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv") endif() diff --git a/cmake/ncnn_check_rvv_vfredusum.cmake b/cmake/ncnn_check_rvv_vfredusum.cmake index 81496a765d12..59065556356a 100644 --- a/cmake/ncnn_check_rvv_vfredusum.cmake +++ b/cmake/ncnn_check_rvv_vfredusum.cmake @@ -9,7 +9,7 @@ int main(void) { float in1[4] = {-1.f,0.f,+1.f,2.f}; float out1=0; - word_type vl = vsetvl_e32m8(4); + size_t vl = vsetvl_e32m8(4); vfloat32m8_t _add = vle32_v_f32m8(in1,vl); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); _sum = vfredsum_vs_f32m8_f32m1(_sum, _add, _sum, vl); @@ -23,7 +23,7 @@ int main(void) { float in1[4] = {-1.f,0.f,+1.f,2.f}; float out1=0; - word_type vl = vsetvl_e32m8(4); + size_t vl = vsetvl_e32m8(4); vfloat32m8_t _add = vle32_v_f32m8(in1,vl); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); _sum = vfredusum_vs_f32m8_f32m1(_sum, _add, _sum, vl); @@ -36,7 +36,7 @@ if(NCNN_COMPILER_USE_VFREDSUM AND NOT NCNN_COMPILER_USE_VFREDUSUM) message(WARNING "The compiler uses vfredsum. Upgrading your toolchain is strongly recommended.") foreach(LMUL 1 2 4 8) add_definitions(-Dvfredusum_vs_f32m${LMUL}_f32m1=vfredsum_vs_f32m${LMUL}_f32m1) - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH) add_definitions(-Dvfredusum_vs_f16m${LMUL}_f16m1=vfredsum_vs_f16m${LMUL}_f16m1) endif() endforeach() diff --git a/cmake/ncnn_generate_lsx_source.cmake b/cmake/ncnn_generate_lsx_source.cmake new file mode 100644 index 000000000000..4f8fb20299aa --- /dev/null +++ b/cmake/ncnn_generate_lsx_source.cmake @@ -0,0 +1,14 @@ + +# must define SRC DST CLASS + +file(READ ${SRC} source_data) + +# replace +string(TOUPPER ${CLASS} CLASS_UPPER) +string(TOLOWER ${CLASS} CLASS_LOWER) + +string(REGEX REPLACE "LAYER_${CLASS_UPPER}_LOONGARCH_H" "LAYER_${CLASS_UPPER}_LOONGARCH_LSX_H" source_data "${source_data}") +string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_lsx" source_data "${source_data}") +string(REGEX REPLACE "#include \"${CLASS_LOWER}_loongarch.h\"" "#include \"${CLASS_LOWER}_loongarch_lsx.h\"" source_data "${source_data}") + +file(WRITE ${DST} "${source_data}") diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 5366da1e112c..81e04f1e6dc3 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -29,7 +29,9 @@ * [Exp](#exp) * [Flatten](#flatten) * [GELU](#gelu) +* [GLU](#glu) * [Gemm](#gemm) +* [GridSample](#gridsample) * [GroupNorm](#groupnorm) * [GRU](#gru) * [HardSigmoid](#hardsigmoid) @@ -784,6 +786,22 @@ else y = 0.5 * x * erfc(-0.70710678 * x) | --------- | ------------- | ----- | --------- | ----------------- | | 0 | fast_gelu | int | 0 | use approximation | +# GLU + +If axis < 0, we use axis = x.dims + axis + +GLU(a,b)=a⊗σ(b) + +where a is the first half of the input matrix and b is the second half. + +axis specifies the dimension to split the input + +* one_blob_only + +| param id | name | type | default | description | +| --------- | ------------- | ----- | --------- | ----------------- | +| 0 | axis | int | 0 | | + # Gemm ``` a = transA ? 
transpose(x0) : x0 @@ -799,6 +817,34 @@ y = gemm(a, b) * alpha + c * beta | 2 | transA | int | 0 | | | 3 | transb | int | 0 | | +# GridSample +``` +Given an input and a flow-field grid, computes the output using input values and pixel locations from grid. + +For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, +which are used to interpolate the output value output[:, h2, w2] + +This function is often used in conjunction with affine_grid() to build Spatial Transformer Networks . +``` + +| param id | name | type | default | description | +| --------- | ------------- | ----- | --------- | ----------------- | +| 0 | sample_type | int | 1 | | +| 1 | padding_mode | int | 1 | | +| 2 | align_corner | int | 0 | | + + +Sample type: +- 1 = Nearest +- 2 = Bilinear +- 3 = Bicubic + +Padding mode: +- 1 = zeros +- 2 = border +- 3 = reflection + + # GroupNorm ``` split x along channel axis into group x0, x1 ... @@ -1026,15 +1072,17 @@ y0, hidden y1, cell y2 = lstm(x0, hidden x1, cell x2) | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | +| 0 | num_output | int | 0 | output size of output | | 1 | weight_data_size| int | 0 | total size of IFOG weight matrix | | 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| 3 | hidden_size | int | num_output| hidden size | | weight | type | shape | | ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output * 4, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output * 4, num_directions] | +| weight_xc_data| float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | +| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | +| weight_hc_data| float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | +| weight_hr_data| float/fp16/int8 | [hidden_size, num_output, num_directions] | Direction flag: - 0 = forward only @@ -1084,14 +1132,16 @@ y = affine(out) | 0 | embed_dim | int | 0 | | | 1 | num_head | int | 1 | | | 2 | weight_data_size| int | 0 | | +| 3 | kdim | int | embed_dim | | +| 4 | vdim | int | embed_dim | | | weight | type | shape | | ------------- | ----- | --------------------- | | q_weight_data | float/fp16/int8 | [weight_data_size] | | q_bias_data | float | [embed_dim] | -| k_weight_data | float/fp16/int8 | [weight_data_size] | +| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | | k_bias_data | float | [embed_dim] | -| v_weight_data | float/fp16/int8 | [weight_data_size] | +| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | | v_bias_data | float | [embed_dim] | | out_weight_data| float/fp16/int8 | [weight_data_size] | | out_bias_data | float | [embed_dim] | diff --git a/docs/faq.en.md b/docs/faq.en.md index 8675a2fc9d51..072d0b33e26c 100644 --- a/docs/faq.en.md +++ b/docs/faq.en.md @@ -216,7 +216,7 @@ Fully customizable op, first change to one that can export (e.g. concat slice), 2. [Learn in 5 minutes! Converting TorchScript models to ncnn models with PNNX](https://zhuanlan.zhihu.com/p/427512763) -# 使用 +# Using - ## vkEnumeratePhysicalDevices failed -3 @@ -290,4 +290,4 @@ Fully customizable op, first change to one that can export (e.g. 
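For the GridSample operator documented above: grid values are normalized coordinates in [-1, 1], and align_corner controls how they map back to pixel positions. The helper below is a rough sketch of that mapping under the usual grid_sample convention (as in torch.nn.functional.grid_sample); it illustrates the parameter semantics only and is not the exact ncnn kernel.

```cpp
// Rough sketch: unnormalize one grid coordinate from [-1, 1] to a pixel
// position along an axis of length `length` (input width or height).
// `align_corner` mirrors GridSample param id 2; the actual sampling and
// padding_mode handling are omitted here.
static float grid_sample_unnormalize(float coord, int length, int align_corner)
{
    if (align_corner)
        return (coord + 1.f) / 2.f * (length - 1); // -1 -> 0, +1 -> length - 1
    else
        return ((coord + 1.f) * length - 1.f) / 2.f; // -1 -> -0.5, +1 -> length - 0.5
}
```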
concat slice), ncnn::Mat in1(60, (void*)testData.data()).reshape(4, 5, 3); // just pass the pointer to the float data as a void*, and even specify the dimension (up says it's best to use reshape to solve the channel gap) float* a = new float[60]; // New a piece of memory yourself, you need to release it later ncnn::Mat in2 = ncnn::Mat(60, (void*)a).reshape(4, 5, 3).clone(); // use the same method as above, clone() to transfer data owner - ``` \ No newline at end of file + ``` diff --git a/docs/faq.md b/docs/faq.md index 4701414c57d0..8d72b792dfa7 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -111,7 +111,7 @@ # 怎样添加ncnn库到项目中?cmake方式怎么用? -编译ncnn,make install。linux/windows set/export ncnn_DIR 指向 isntall目录下下包含ncnnConfig.cmake 的目录 +编译ncnn,make install。linux/windows set/export ncnn_DIR 指向 install目录下包含ncnnConfig.cmake 的目录 - ## android diff --git a/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md b/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md index 526de5ab2656..f4cf49399f85 100644 --- a/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md +++ b/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md @@ -170,3 +170,19 @@ ncnn::Net net; // param_buffer is the content buffe of XYZ.param file net.load_param_mem(param_buffer); ``` + + +### disable fp16 + +Some models may overflow fp16, resulting in a nan result. + +So try to turn off fp16 lower-precision optimizations, and the precision will be improved to fp32 to investigate and solve the overflow problem caused by this. + +You can set it as follows +```cpp +ncnn::Net net; + +net.opt.use_fp16_packed = false; +net.opt.use_fp16_storage = false; +net.opt.use_fp16_arithmetic = false; +``` \ No newline at end of file diff --git a/examples/yolov5.cpp b/examples/yolov5.cpp index b1a8e8495532..88f6db21222b 100644 --- a/examples/yolov5.cpp +++ b/examples/yolov5.cpp @@ -26,9 +26,10 @@ #include #include -#define YOLOV5_V60 1 //YOLOv5 v6.0 +//#define YOLOV5_V60 1 //YOLOv5 v6.0 +#define YOLOV5_V62 1 //YOLOv5 v6.2 export onnx model method https://github.com/shaoshengsong/yolov5_62_export_ncnn -#if YOLOV5_V60 +#if YOLOV5_V60 || YOLOV5_V62 #define MAX_STRIDE 64 #else #define MAX_STRIDE 32 @@ -79,7 +80,7 @@ class YoloV5Focus : public ncnn::Layer }; DEFINE_LAYER_CREATOR(YoloV5Focus) -#endif //YOLOV5_V60 +#endif //YOLOV5_V60 YOLOV5_V62 struct Object { @@ -278,7 +279,12 @@ static int detect_yolov5(const cv::Mat& bgr, std::vector& objects) // original pretrained model from https://github.com/ultralytics/yolov5 // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models -#if YOLOV5_V60 +#if YOLOV5_V62 + if (yolov5.load_param("yolov5s_6.2.param")) + exit(-1); + if (yolov5.load_model("yolov5s_6.2.bin")) + exit(-1); +#elif YOLOV5_V60 if (yolov5.load_param("yolov5s_6.0.param")) exit(-1); if (yolov5.load_model("yolov5s_6.0.bin")) @@ -358,7 +364,10 @@ static int detect_yolov5(const cv::Mat& bgr, std::vector& objects) // stride 16 { ncnn::Mat out; -#if YOLOV5_V60 + +#if YOLOV5_V62 + ex.extract("353", out); +#elif YOLOV5_V60 ex.extract("376", out); #else ex.extract("781", out); @@ -381,7 +390,9 @@ static int detect_yolov5(const cv::Mat& bgr, std::vector& objects) // stride 32 { ncnn::Mat out; -#if YOLOV5_V60 +#if YOLOV5_V62 + ex.extract("367", out); +#elif YOLOV5_V60 ex.extract("401", out); #else ex.extract("801", out); diff --git a/glslang b/glslang index 86ff4bca1ddc..88fd417b0bb7 160000 --- a/glslang +++ b/glslang @@ -1 +1 @@ -Subproject commit 86ff4bca1ddc7e2262f119c16e7228d0efb67610 +Subproject commit 
88fd417b0bb7d91755961c70e846d274c182f2b0 diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 140555ce7060..999efa1deb65 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -8,6 +8,19 @@ add_definitions(-DVERSION_INFO="${PACKAGE_VERSION}") set( CMAKE_CXX_STANDARD 11 ) set( CMAKE_CXX_STANDARD_REQUIRED ON ) +if(CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "ARM64") + option(PYBIND11_PYTHONLIBS_OVERWRITE "" OFF) + + set(PYTHON_PREFIX "$ENV{LOCALAPPDATA}/pypa/cibuildwheel/Cache/nuget-cpython/pythonarm64.$ENV{PYTHON_VERSION}/tools") + if(NOT DEFINED $ENV{CIBUILDWHEEL}) + message(WARNING + " This is hack for cibuildwheel on github action\n" + " Use the right way to cross-compile python module for windows arm64 like follows\n" + " set(PYTHON_PREFIX \"\")\n" + ) + endif() +endif() + add_subdirectory(pybind11) if("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "") diff --git a/python/pybind11 b/python/pybind11 index 70a58c577eaf..80dc998efced 160000 --- a/python/pybind11 +++ b/python/pybind11 @@ -1 +1 @@ -Subproject commit 70a58c577eaf067748c2ec31bfd0b0a614cffba6 +Subproject commit 80dc998efced8ceb2be59756668a7e90e8bef917 diff --git a/python/src/main.cpp b/python/src/main.cpp index cef29e9a0530..c90b289ef379 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -287,17 +287,18 @@ PYBIND11_MODULE(ncnn, m) .def_buffer([](Mat& m) -> py::buffer_info { return to_buffer_info(m); }) - .def("numpy", [](py::object obj, const std::string& format="") -> py::array { + .def( + "numpy", [](py::object obj, const std::string& format = "") -> py::array { auto* m = obj.cast(); return py::array(to_buffer_info(*m, format), obj); - }, py::arg("format")="") + }, + py::arg("format") = "", "i for int32, f for float32, d for double") //.def("fill", (void (Mat::*)(int))(&Mat::fill), py::arg("v")) .def("fill", (void (Mat::*)(float))(&Mat::fill), py::arg("v")) .def("clone", &Mat::clone, py::arg("allocator") = nullptr) .def("clone_from", &Mat::clone_from, py::arg("mat"), py::arg("allocator") = nullptr) .def( - "reshape", - [](Mat& mat, py::tuple shape, Allocator* allocator) { + "reshape", [](Mat& mat, py::tuple shape, Allocator* allocator) { switch (shape.size()) { case 1: @@ -316,18 +317,13 @@ PYBIND11_MODULE(ncnn, m) return Mat(); }, py::arg("shape") = py::tuple(1), py::arg("allocator") = nullptr) - .def("reshape", (Mat(Mat::*)(int, Allocator*) const) & Mat::reshape, - py::arg("w"), py::kw_only(), py::arg("allocator") = nullptr) - .def("reshape", (Mat(Mat::*)(int, int, Allocator*) const) & Mat::reshape, - py::arg("w"), py::arg("h"), py::kw_only(), py::arg("allocator") = nullptr) - .def("reshape", (Mat(Mat::*)(int, int, int, Allocator*) const) & Mat::reshape, - py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr) - .def("reshape", (Mat(Mat::*)(int, int, int, int, Allocator*) const) & Mat::reshape, - py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr) + .def("reshape", (Mat(Mat::*)(int, Allocator*) const) & Mat::reshape, py::arg("w"), py::kw_only(), py::arg("allocator") = nullptr) + .def("reshape", (Mat(Mat::*)(int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), py::kw_only(), py::arg("allocator") = nullptr) + .def("reshape", (Mat(Mat::*)(int, int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr) + .def("reshape", (Mat(Mat::*)(int, int, int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), 
py::arg("d"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr) .def( - "create", - [](Mat& mat, py::tuple shape, size_t elemsize, int elempack, Allocator* allocator) { + "create", [](Mat& mat, py::tuple shape, size_t elemsize, int elempack, Allocator* allocator) { switch (shape.size()) { case 1: @@ -345,23 +341,12 @@ PYBIND11_MODULE(ncnn, m) } return; }, - py::arg("shape"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, - py::arg("allocator") = nullptr) - .def("create", (void (Mat::*)(int, size_t, int, Allocator*)) & Mat::create, - py::arg("w"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) - .def("create", (void (Mat::*)(int, int, size_t, int, Allocator*)) & Mat::create, - py::arg("w"), py::arg("h"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) - .def("create", (void (Mat::*)(int, int, int, size_t, int, Allocator*)) & Mat::create, - py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) - .def("create", (void (Mat::*)(int, int, int, int, size_t, int, Allocator*)) & Mat::create, - py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) - .def("create_like", (void (Mat::*)(const Mat&, Allocator*)) & Mat::create_like, - py::arg("m"), py::arg("allocator") = nullptr) + py::arg("shape"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create", (void (Mat::*)(int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create", (void (Mat::*)(int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create", (void (Mat::*)(int, int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create", (void (Mat::*)(int, int, int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create_like", (void (Mat::*)(const Mat&, Allocator*)) & Mat::create_like, py::arg("m"), py::arg("allocator") = nullptr) .def("addref", &Mat::addref) .def("release", &Mat::release) .def("empty", &Mat::empty) @@ -373,8 +358,7 @@ PYBIND11_MODULE(ncnn, m) .def("depth", (Mat(Mat::*)(int)) & Mat::depth, py::arg("z")) //.def("depth", (const Mat (Mat::*)(int) const) & Mat::depth, py::arg("z")) .def( - "row", - [](Mat& m, int y) { + "row", [](Mat& m, int y) { if (m.elempack != 1) { std::stringstream ss; diff --git a/python/src/pybind11_allocator.h b/python/src/pybind11_allocator.h index 7c568209cfea..64ce553c752a 100644 --- a/python/src/pybind11_allocator.h +++ b/python/src/pybind11_allocator.h @@ -25,11 +25,11 @@ class PyAllocator : public Base using Base::Base; // Inherit constructors void* fastMalloc(size_t size) override { - PYBIND11_OVERLOAD_PURE(void*, Base, fastMalloc, size); + PYBIND11_OVERRIDE_PURE(void*, Base, fastMalloc, size); } void fastFree(void* ptr) override { - PYBIND11_OVERLOAD_PURE(void, Base, fastFree, ptr); + 
PYBIND11_OVERRIDE_PURE(void, Base, fastFree, ptr); } }; @@ -40,11 +40,11 @@ class PyAllocatorOther : public PyAllocator using PyAllocator::PyAllocator; void* fastMalloc(size_t size) override { - PYBIND11_OVERLOAD(void*, Other, fastMalloc, size); + PYBIND11_OVERRIDE(void*, Other, fastMalloc, size); } void fastFree(void* ptr) override { - PYBIND11_OVERLOAD(void, Other, fastFree, ptr); + PYBIND11_OVERRIDE(void, Other, fastFree, ptr); } }; @@ -56,23 +56,23 @@ class PyVkAllocator : public Base using Base::Base; // Inherit constructors void clear() override { - PYBIND11_OVERLOAD(void, Base, clear, ); + PYBIND11_OVERRIDE(void, Base, clear, ); } ncnn::VkBufferMemory* fastMalloc(size_t size) override { - PYBIND11_OVERLOAD_PURE(ncnn::VkBufferMemory*, Base, fastMalloc, size); + PYBIND11_OVERRIDE_PURE(ncnn::VkBufferMemory*, Base, fastMalloc, size); } void fastFree(ncnn::VkBufferMemory* ptr) override { - PYBIND11_OVERLOAD_PURE(void, Base, fastFree, ptr); + PYBIND11_OVERRIDE_PURE(void, Base, fastFree, ptr); } int flush(ncnn::VkBufferMemory* ptr) override { - PYBIND11_OVERLOAD(int, Base, flush, ptr); + PYBIND11_OVERRIDE(int, Base, flush, ptr); } int invalidate(ncnn::VkBufferMemory* ptr) override { - PYBIND11_OVERLOAD(int, Base, invalidate, ptr); + PYBIND11_OVERRIDE(int, Base, invalidate, ptr); } }; @@ -83,15 +83,15 @@ class PyVkAllocatorOther : public PyVkAllocator using PyVkAllocator::PyVkAllocator; void clear() override { - PYBIND11_OVERLOAD(void, Other, clear, ); + PYBIND11_OVERRIDE(void, Other, clear, ); } ncnn::VkBufferMemory* fastMalloc(size_t size) override { - PYBIND11_OVERLOAD(ncnn::VkBufferMemory*, Other, fastMalloc, size); + PYBIND11_OVERRIDE(ncnn::VkBufferMemory*, Other, fastMalloc, size); } void fastFree(ncnn::VkBufferMemory* ptr) override { - PYBIND11_OVERLOAD(void, Other, fastFree, ptr); + PYBIND11_OVERRIDE(void, Other, fastFree, ptr); } }; @@ -102,17 +102,15 @@ class PyVkBlobAllocator : public Base using Base::Base; // Inherit constructors void clear() override { - PYBIND11_OVERLOAD(void, Base, clear, ); + PYBIND11_OVERRIDE(void, Base, clear, ); } - ncnn::VkImageMemory* fastMalloc(int width, int height, - VkFormat format) override + ncnn::VkImageMemory* fastMalloc(int width, int height, VkFormat format) override { - PYBIND11_OVERLOAD_PURE(ncnn::VkImageMemory*, Base, fastMalloc, width, - height, format); + PYBIND11_OVERRIDE_PURE(ncnn::VkImageMemory*, Base, fastMalloc, width, height, format); } void fastFree(ncnn::VkImageMemory* ptr) override { - PYBIND11_OVERLOAD_PURE(void, Base, fastFree, ptr); + PYBIND11_OVERRIDE_PURE(void, Base, fastFree, ptr); } }; @@ -124,14 +122,13 @@ class PyVkBlobAllocator : public Base // ncnn::VkImageMemory* fastMalloc(int width, int height, // VkFormat format) override // { -// PYBIND11_OVERLOAD(ncnn::VkImageMemory*, Other, fastMalloc, width, height, -// format); +// PYBIND11_OVERRIDE(ncnn::VkImageMemory*, Other, fastMalloc, width, height, format); // } // void fastFree(ncnn::VkImageMemory* ptr) override // { -// PYBIND11_OVERLOAD(void, Other, fastFree, ptr); +// PYBIND11_OVERRIDE(void, Other, fastFree, ptr); // } //}; #endif // NCNN_VULKAN -#endif \ No newline at end of file +#endif diff --git a/python/src/pybind11_datareader.h b/python/src/pybind11_datareader.h index e9ecfb8058e7..63b67ee47c75 100644 --- a/python/src/pybind11_datareader.h +++ b/python/src/pybind11_datareader.h @@ -42,12 +42,12 @@ class PyDataReader : public Base #if NCNN_STRING int scan(const char* format, void* p) const override { - PYBIND11_OVERLOAD(int, Base, scan, format, p); + 
PYBIND11_OVERRIDE(int, Base, scan, format, p); } #endif // NCNN_STRING size_t read(void* buf, size_t size) const override { - PYBIND11_OVERLOAD(size_t, Base, read, buf, size); + PYBIND11_OVERRIDE(size_t, Base, read, buf, size); } }; @@ -59,13 +59,13 @@ class PyDataReaderOther : public PyDataReader #if NCNN_STRING int scan(const char* format, void* p) const override { - PYBIND11_OVERLOAD(int, Other, scan, format, p); + PYBIND11_OVERRIDE(int, Other, scan, format, p); } #endif // NCNN_STRING size_t read(void* buf, size_t size) const override { - PYBIND11_OVERLOAD(size_t, Other, read, buf, size); + PYBIND11_OVERRIDE(size_t, Other, read, buf, size); } }; -#endif \ No newline at end of file +#endif diff --git a/python/src/pybind11_mat.h b/python/src/pybind11_mat.h index 1a1d1f1c626d..04663e829d0b 100644 --- a/python/src/pybind11_mat.h +++ b/python/src/pybind11_mat.h @@ -48,59 +48,69 @@ std::string get_mat_format(const ncnn::Mat& m) // f (float) // d (double) // leave it to empty to use get_mat_format -py::buffer_info to_buffer_info(ncnn::Mat &m, const std::string &format = "") { - if (m.elemsize != 1 && m.elemsize != 2 && m.elemsize != 4) { - std::stringstream ss; - ss << "convert ncnn.Mat to numpy.ndarray only elemsize 1, 2, 4 support " - "now, but given " - << m.elemsize; - pybind11::pybind11_fail(ss.str()); - } - if (m.elempack != 1) { - std::stringstream ss; - ss << "convert ncnn.Mat to numpy.ndarray only elempack 1 support now, but " - "given " - << m.elempack; - pybind11::pybind11_fail(ss.str()); - } - std::string _format(format); - if (_format.empty()) { - _format = get_mat_format(m); - } - std::vector shape; - std::vector strides; - if (m.dims == 1) { - shape.push_back(m.w); - strides.push_back(m.elemsize); - } else if (m.dims == 2) { - shape.push_back(m.h); - shape.push_back(m.w); - strides.push_back(m.w * m.elemsize); - strides.push_back(m.elemsize); - } else if (m.dims == 3) { - shape.push_back(m.c); - shape.push_back(m.h); - shape.push_back(m.w); - strides.push_back(m.cstep * m.elemsize); - strides.push_back(m.w * m.elemsize); - strides.push_back(m.elemsize); - } else if (m.dims == 4) { - shape.push_back(m.c); - shape.push_back(m.d); - shape.push_back(m.h); - shape.push_back(m.w); - strides.push_back(m.cstep * m.elemsize); - strides.push_back(m.w * m.h * m.elemsize); - strides.push_back(m.w * m.elemsize); - strides.push_back(m.elemsize); - } - return py::buffer_info(m.data, /* Pointer to buffer */ - m.elemsize, /* Size of one scalar */ - _format, /* Python struct-style format descriptor */ - m.dims, /* Number of dimensions */ - shape, /* Buffer dimensions */ - strides /* Strides (in bytes) for each index */ - ); +py::buffer_info to_buffer_info(ncnn::Mat& m, const std::string& format = "") +{ + if (m.elemsize != 1 && m.elemsize != 2 && m.elemsize != 4) + { + std::ostringstream ss; + ss << "Convert ncnn.Mat to numpy.ndarray. Support only elemsize 1, 2, 4; but given " + << m.elemsize; + py::pybind11_fail(ss.str()); + } + if (m.elempack != 1) + { + std::ostringstream ss; + ss << "Convert ncnn.Mat to numpy.ndarray. 
Support only elempack == 1, but " + "given " + << m.elempack; + py::pybind11_fail(ss.str()); + } + std::string _format(format); + if (_format.empty()) + { + _format = get_mat_format(m); + } + std::vector shape; + std::vector strides; + if (m.dims == 1) + { + shape.push_back(m.w); + strides.push_back(m.elemsize); + } + else if (m.dims == 2) + { + shape.push_back(m.h); + shape.push_back(m.w); + strides.push_back(m.w * m.elemsize); + strides.push_back(m.elemsize); + } + else if (m.dims == 3) + { + shape.push_back(m.c); + shape.push_back(m.h); + shape.push_back(m.w); + strides.push_back(m.cstep * m.elemsize); + strides.push_back(m.w * m.elemsize); + strides.push_back(m.elemsize); + } + else if (m.dims == 4) + { + shape.push_back(m.c); + shape.push_back(m.d); + shape.push_back(m.h); + shape.push_back(m.w); + strides.push_back(m.cstep * m.elemsize); + strides.push_back(m.w * m.h * m.elemsize); + strides.push_back(m.w * m.elemsize); + strides.push_back(m.elemsize); + } + return py::buffer_info(m.data, /* Pointer to buffer */ + m.elemsize, /* Size of one scalar */ + _format, /* Python struct-style format descriptor */ + m.dims, /* Number of dimensions */ + shape, /* Buffer dimensions */ + strides /* Strides (in bytes) for each index */ + ); } -#endif // PYBIND11_NCNN_MAT_H +#endif diff --git a/python/src/pybind11_modelbin.h b/python/src/pybind11_modelbin.h index 6b5e3a676caf..5f875061c886 100644 --- a/python/src/pybind11_modelbin.h +++ b/python/src/pybind11_modelbin.h @@ -25,13 +25,13 @@ class PyModelBin : public Base using Base::Base; // Inherit constructors ncnn::Mat load(int w, int type) const override { - PYBIND11_OVERLOAD_PURE(ncnn::Mat, Base, load, w, type); + PYBIND11_OVERRIDE_PURE(ncnn::Mat, Base, load, w, type); } //ncnn::Mat load(int w, int h, int type) const override { - // PYBIND11_OVERLOAD(ncnn::Mat, Base, load, w, h, type); + // PYBIND11_OVERRIDE(ncnn::Mat, Base, load, w, h, type); //} //ncnn::Mat load(int w, int h, int c, int type) const override { - // PYBIND11_OVERLOAD(ncnn::Mat, Base, load, w, h, c, type); + // PYBIND11_OVERRIDE(ncnn::Mat, Base, load, w, h, c, type); //} }; @@ -42,8 +42,8 @@ class PyModelBinOther : public PyModelBin using PyModelBin::PyModelBin; ncnn::Mat load(int w, int type) const override { - PYBIND11_OVERLOAD(ncnn::Mat, Other, load, w, type); + PYBIND11_OVERRIDE(ncnn::Mat, Other, load, w, type); } }; -#endif \ No newline at end of file +#endif diff --git a/python/tests/test_mat.py b/python/tests/test_mat.py index 605d59c39191..7019961a5770 100644 --- a/python/tests/test_mat.py +++ b/python/tests/test_mat.py @@ -211,10 +211,11 @@ def test_mat_dims4(): def test_numpy(): mat = ncnn.Mat(1) - array = np.array(mat) + array = mat.numpy() assert mat.dims == array.ndim and mat.w == array.shape[0] mat = ncnn.Mat(2, 3) - array = np.array(mat) + array = mat.numpy() + assert array.dtype == np.float32 assert ( mat.dims == array.ndim and mat.w == array.shape[1] and mat.h == array.shape[0] ) @@ -237,10 +238,10 @@ def test_numpy(): ) mat = ncnn.Mat(1, elemsize=1) - array = np.array(mat) + array = mat.numpy() assert array.dtype == np.int8 mat = ncnn.Mat(1, elemsize=2) - array = np.array(mat) + array = mat.numpy() assert array.dtype == np.float16 # pybind11 def_buffer throw bug # with pytest.raises(RuntimeError) as execinfo: @@ -251,7 +252,7 @@ def test_numpy(): # ) assert array.dtype == np.float16 mat = ncnn.Mat(1, elemsize=4) - array = np.array(mat) + array = mat.numpy() assert array.dtype == np.float32 mat = np.random.randint(0, 128, size=(12,)).astype(np.uint8) @@ -279,6 
+280,19 @@ def test_numpy(): array = np.array(mat) assert (mat == array).all() + array = np.array([1, 2, 3], dtype=np.int32) + mat = ncnn.Mat(array) + array2 = mat.numpy(format='i') + assert array2.dtype == np.int32 + array[0] = 10 + assert array2[0] == 10 + + array = np.array([1, 2, 3], dtype=np.float32) + mat = ncnn.Mat(array) + array2 = mat.numpy(format='f') + assert array2.dtype == np.float32 + array2[0] = 100 + assert array[0] == 100 def test_fill(): mat = ncnn.Mat(1) diff --git a/setup.py b/setup.py index 3c97205e453c..89e78bf7764c 100644 --- a/setup.py +++ b/setup.py @@ -67,6 +67,8 @@ def build_extension(self, ext): "-DPYTHON_EXECUTABLE={}".format(sys.executable), "-DCMAKE_BUILD_TYPE={}".format(cfg), # not used on MSVC, but no harm "-DNCNN_PYTHON=ON", + "-DNCNN_DISABLE_RTTI=OFF", + "-DNCNN_DISABLE_EXCEPTION=OFF", "-DNCNN_BUILD_BENCHMARK=OFF", "-DNCNN_BUILD_EXAMPLES=OFF", "-DNCNN_BUILD_TOOLS=OFF", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index df423dfc0b7b..832dfe4a8bd1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -161,6 +161,9 @@ ncnn_add_layer(MakePadMask) ncnn_add_layer(RelShift) ncnn_add_layer(GLU) ncnn_add_layer(LSTM2) +ncnn_add_layer(Fold) +ncnn_add_layer(Unfold) +ncnn_add_layer(GridSample) if(NCNN_VULKAN) ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp) @@ -227,6 +230,8 @@ if(NCNN_OPENMP) elseif(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20)) target_compile_options(ncnn PRIVATE -fopenmp) target_link_libraries(ncnn PUBLIC -fopenmp -static-openmp) + # see cpu.cpp __wrap___kmp_abort_process comment for the linker magic + target_link_libraries(ncnn PUBLIC -Wl,-wrap,__kmp_affinity_determine_capable) elseif(OpenMP_CXX_FOUND) target_link_libraries(ncnn PUBLIC OpenMP::OpenMP_CXX) else() @@ -451,6 +456,12 @@ if(((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (APPLE AND CMAKE_OSX_AR endif() endif() target_compile_options(ncnn PRIVATE ${ARM_MARCH_FLAG}) + + if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER_EQUAL 23)) + # llvm 12 in ndk-23 enables out-of-line atomics by default + # disable this feature for fixing linking atomic builtins issue with old ndk + target_compile_options(ncnn PRIVATE -mno-outline-atomics) + endif() endif() if(NCNN_TARGET_ARCH STREQUAL "mips") @@ -462,10 +473,18 @@ if(NCNN_TARGET_ARCH STREQUAL "mips") endif() endif() +if(NCNN_TARGET_ARCH STREQUAL "loongarch") + if(NOT NCNN_RUNTIME_CPU AND NCNN_LSX) + target_compile_options(ncnn PRIVATE -mlsx) + endif() +endif() + if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906) if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV) - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH) target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh) + elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) + target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16) elseif(NCNN_COMPILER_SUPPORT_RVV) target_compile_options(ncnn PRIVATE -march=rv64gcv) endif() @@ -533,6 +552,3 @@ endif() set_property(GLOBAL PROPERTY USE_FOLDERS ON) set_property(TARGET ncnn PROPERTY FOLDER "libncnn") set_property(TARGET ncnn-generate-spirv PROPERTY FOLDER "libncnn") - -add_executable(main main.cpp) -target_link_libraries(main ncnn) diff --git a/src/allocator.cpp b/src/allocator.cpp index d14c81511c30..485d07951d08 100644 --- a/src/allocator.cpp +++ b/src/allocator.cpp @@ -33,6 +33,7 @@ class PoolAllocatorPrivate Mutex budgets_lock; Mutex payouts_lock; unsigned int size_compare_ratio; // 0~256 + size_t size_drop_threshold; std::list > budgets; std::list > payouts; 
}; @@ -40,7 +41,8 @@ class PoolAllocatorPrivate PoolAllocator::PoolAllocator() : Allocator(), d(new PoolAllocatorPrivate) { - d->size_compare_ratio = 192; // 0.75f * 256 + d->size_compare_ratio = 0; + d->size_drop_threshold = 10; } PoolAllocator::~PoolAllocator() @@ -99,12 +101,17 @@ void PoolAllocator::set_size_compare_ratio(float scr) d->size_compare_ratio = (unsigned int)(scr * 256); } +void PoolAllocator::set_size_drop_threshold(size_t threshold) +{ + d->size_drop_threshold = threshold; +} + void* PoolAllocator::fastMalloc(size_t size) { d->budgets_lock.lock(); // find free budget - std::list >::iterator it = d->budgets.begin(); + std::list >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin(); for (; it != d->budgets.end(); ++it) { size_t bs = it->first; @@ -126,6 +133,35 @@ void* PoolAllocator::fastMalloc(size_t size) return ptr; } + + if (bs < it_min->first) + { + it_min = it; + } + if (bs > it_max->first) + { + it_max = it; + } + } + + if (d->budgets.size() >= d->size_drop_threshold) + { + // All chunks in pool are not chosen. Then try to drop some outdated + // chunks and return them to OS. + if (it_max->first < size) + { + // Current query is asking for a chunk larger than any cached chunks. + // Then remove the smallest one. + ncnn::fastFree(it_min->second); + d->budgets.erase(it_min); + } + else if (it_min->first > size) + { + // Current query is asking for a chunk smaller than any cached chunks. + // Then remove the largest one. + ncnn::fastFree(it_max->second); + d->budgets.erase(it_max); + } } d->budgets_lock.unlock(); @@ -178,6 +214,7 @@ class UnlockedPoolAllocatorPrivate { public: unsigned int size_compare_ratio; // 0~256 + size_t size_drop_threshold; std::list > budgets; std::list > payouts; }; @@ -185,7 +222,8 @@ class UnlockedPoolAllocatorPrivate UnlockedPoolAllocator::UnlockedPoolAllocator() : Allocator(), d(new UnlockedPoolAllocatorPrivate) { - d->size_compare_ratio = 192; // 0.75f * 256 + d->size_compare_ratio = 0; + d->size_drop_threshold = 10; } UnlockedPoolAllocator::~UnlockedPoolAllocator() @@ -240,10 +278,15 @@ void UnlockedPoolAllocator::set_size_compare_ratio(float scr) d->size_compare_ratio = (unsigned int)(scr * 256); } +void UnlockedPoolAllocator::set_size_drop_threshold(size_t threshold) +{ + d->size_drop_threshold = threshold; +} + void* UnlockedPoolAllocator::fastMalloc(size_t size) { // find free budget - std::list >::iterator it = d->budgets.begin(); + std::list >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin(); for (; it != d->budgets.end(); ++it) { size_t bs = it->first; @@ -259,6 +302,29 @@ void* UnlockedPoolAllocator::fastMalloc(size_t size) return ptr; } + + if (bs > it_max->first) + { + it_max = it; + } + if (bs < it_min->first) + { + it_min = it; + } + } + + if (d->budgets.size() >= d->size_drop_threshold) + { + if (it_max->first < size) + { + ncnn::fastFree(it_min->second); + d->budgets.erase(it_min); + } + else if (it_min->first > size) + { + ncnn::fastFree(it_max->second); + d->budgets.erase(it_max); + } } // new diff --git a/src/allocator.h b/src/allocator.h index c9fcf90d1812..3a5ebcac56bc 100644 --- a/src/allocator.h +++ b/src/allocator.h @@ -170,9 +170,13 @@ class NCNN_EXPORT PoolAllocator : public Allocator ~PoolAllocator(); // ratio range 0 ~ 1 - // default cr = 0.75 + // default cr = 0 void set_size_compare_ratio(float scr); + // budget drop threshold + // default threshold = 10 + void set_size_drop_threshold(size_t); + // release all budgets 
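For the pool allocator changes above: the default size_compare_ratio moves from 0.75 to 0, and the new set_size_drop_threshold (default 10) bounds how many free chunks the pool keeps; once that many chunks are cached and none is picked for a request, the smallest or largest cached chunk is freed back to the OS, as the fastMalloc hunk shows. A minimal usage sketch mirroring how benchncnn wires its allocators, assuming the usual ncnn headers and that the allocators outlive the Net (the function name is illustrative):

```cpp
#include "allocator.h" // ncnn::PoolAllocator, ncnn::UnlockedPoolAllocator
#include "net.h"       // ncnn::Net (net.opt holds the allocator pointers)

// Minimal sketch: plug pool allocators into a Net and tune the new knob.
static void setup_pool_allocators(ncnn::Net& net,
                                  ncnn::UnlockedPoolAllocator& blob_allocator,
                                  ncnn::PoolAllocator& workspace_allocator)
{
    // keep at most 10 unmatched free chunks before one is dropped (new default)
    blob_allocator.set_size_drop_threshold(10);
    workspace_allocator.set_size_drop_threshold(10);

    // 0 is the new default compare ratio; see allocator.cpp for how it
    // affects which cached chunk sizes are considered reusable
    blob_allocator.set_size_compare_ratio(0.f);
    workspace_allocator.set_size_compare_ratio(0.f);

    net.opt.blob_allocator = &blob_allocator;
    net.opt.workspace_allocator = &workspace_allocator;
}
```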
immediately void clear(); @@ -195,9 +199,13 @@ class NCNN_EXPORT UnlockedPoolAllocator : public Allocator ~UnlockedPoolAllocator(); // ratio range 0 ~ 1 - // default cr = 0.75 + // default cr = 0 void set_size_compare_ratio(float scr); + // budget drop threshold + // default threshold = 10 + void set_size_drop_threshold(size_t); + // release all budgets immediately void clear(); diff --git a/src/c_api.cpp b/src/c_api.cpp index 9bb1ba1819b8..516ceec7df40 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -138,8 +138,11 @@ ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator() void ncnn_allocator_destroy(ncnn_allocator_t allocator) { - delete (Allocator*)allocator->pthis; - free(allocator); + if (allocator) + { + delete (Allocator*)allocator->pthis; + free(allocator); + } } /* option api */ @@ -163,6 +166,26 @@ void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads) ((Option*)opt)->num_threads = num_threads; } +int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt) +{ + return ((Option*)opt)->use_local_pool_allocator; +} + +void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator) +{ + ((Option*)opt)->use_local_pool_allocator = use_local_pool_allocator; +} + +void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator) +{ + ((Option*)opt)->blob_allocator = allocator ? (Allocator*)allocator->pthis : NULL; +} + +void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator) +{ + ((Option*)opt)->workspace_allocator = allocator ? (Allocator*)allocator->pthis : NULL; +} + int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt) { #if NCNN_VULKAN @@ -191,82 +214,82 @@ ncnn_mat_t ncnn_mat_create() ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, c, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, c, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, d, c, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, d, c, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, data, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, data, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, data, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, data, (size_t)4u, allocator ? 
(Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, c, data, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, c, data, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, d, c, data, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, d, c, data, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, c, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, c, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, d, c, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, d, c, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, data, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, data, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, data, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, data, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, c, data, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, c, data, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, d, c, data, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, d, c, data, elemsize, elempack, allocator ? 
(Allocator*)allocator->pthis : NULL)); } void ncnn_mat_destroy(ncnn_mat_t mat) @@ -281,27 +304,27 @@ void ncnn_mat_fill_float(ncnn_mat_t mat, float v) ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->clone((Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->clone(allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, c, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, c, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, d, c, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, d, c, allocator ? (Allocator*)allocator->pthis : NULL))); } int ncnn_mat_get_dims(const ncnn_mat_t mat) @@ -359,22 +382,22 @@ void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c) /* mat pixel api */ ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(Mat::from_pixels(pixels, type, w, h, stride, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(Mat::from_pixels(pixels, type, w, h, stride, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(Mat::from_pixels_resize(pixels, type, w, h, stride, target_width, target_height, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(Mat::from_pixels_resize(pixels, type, w, h, stride, target_width, target_height, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi(pixels, type, w, h, stride, roix, roiy, roiw, roih, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi(pixels, type, w, h, stride, roix, roiy, roiw, roih, allocator ? 
(Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi_resize(pixels, type, w, h, stride, roix, roiy, roiw, roih, target_width, target_height, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi_resize(pixels, type, w, h, stride, roix, roiy, roiw, roih, target_width, target_height, allocator ? (Allocator*)allocator->pthis : NULL))); } void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride) @@ -1190,6 +1213,11 @@ void ncnn_net_destroy(ncnn_net_t net) free(net); } +ncnn_option_t ncnn_net_get_option(ncnn_net_t net) +{ + return (ncnn_option_t)(&((Net*)(net->pthis))->opt); +} + void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt) { ((Net*)net->pthis)->opt = *((Option*)opt); diff --git a/src/c_api.h b/src/c_api.h index 39f872bdbb6c..b7435f846ba1 100644 --- a/src/c_api.h +++ b/src/c_api.h @@ -51,6 +51,12 @@ NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt); NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt); NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads); +NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt); +NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator); + +NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator); +NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator); + NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt); NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute); @@ -265,6 +271,7 @@ struct __ncnn_net_t NCNN_EXPORT ncnn_net_t ncnn_net_create(); NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net); +NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net); NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt); #if NCNN_STRING diff --git a/src/cpu.cpp b/src/cpu.cpp index 197093d6dd21..85c65335ccca 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -42,6 +42,12 @@ #include #endif +#if defined _WIN32 && !(defined __MINGW32__) +#define WIN32_LEAN_AND_MEAN +#include +#include +#endif + #if defined __ANDROID__ || defined __linux__ #if defined __ANDROID__ #if __ANDROID_API__ >= 18 @@ -88,10 +94,18 @@ #ifndef CPUFAMILY_ARM_AVALANCHE_BLIZZARD #define CPUFAMILY_ARM_AVALANCHE_BLIZZARD 0xda33d83d #endif +// A16 +#ifndef CPUFAMILY_ARM_EVEREST_SAWTOOTH +#define CPUFAMILY_ARM_EVEREST_SAWTOOTH 0x8765edea +#endif // M1 #ifndef CPUFAMILY_AARCH64_FIRESTORM_ICESTORM #define CPUFAMILY_AARCH64_FIRESTORM_ICESTORM 0x1b588bb3 #endif +// M2 +#ifndef CPUFAMILY_AARCH64_AVALANCHE_BLIZZARD +#define CPUFAMILY_AARCH64_AVALANCHE_BLIZZARD 0xda33d83d +#endif #endif // __APPLE__ #if defined(__SSE3__) @@ -159,7 +173,7 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type) return 0; } -#if __aarch64__ || __mips64 || __riscv_xlen == 64 +#if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64 struct { uint64_t tag; @@ -236,6 +250,12 @@ static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2); #define HWCAP_LOONGSON_MMI (1 << 11) #endif +#if __loongarch64 +// from arch/loongarch/include/uapi/asm/hwcap.h +#define HWCAP_LOONGARCH_LSX (1 << 4) +#define HWCAP_LOONGARCH_LASX 
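The C API above gains ncnn_net_get_option, ncnn_option_get/set_use_local_pool_allocator, and ncnn_option_set_blob_allocator / ncnn_option_set_workspace_allocator, and the allocator arguments of the ncnn_mat_* constructors become NULL-safe. A small sketch using only entry points visible in this patch; param/model loading and extraction are elided, the allocators must outlive the net, and see option.h for the exact semantics of use_local_pool_allocator:

```cpp
#include "c_api.h" // ncnn C API, extern "C" so it also compiles as C++

// Small sketch: tune a net's option in place and attach pool allocators.
static void configure_net(ncnn_net_t net)
{
    // the new getter exposes the net's own option, so edits apply directly
    ncnn_option_t opt = ncnn_net_get_option(net);
    ncnn_option_set_num_threads(opt, 4);
    ncnn_option_set_use_local_pool_allocator(opt, 0);

    // attach explicit pool allocators through the option
    ncnn_allocator_t blob_allocator = ncnn_allocator_create_pool_allocator();
    ncnn_allocator_t workspace_allocator = ncnn_allocator_create_unlocked_pool_allocator();
    ncnn_option_set_blob_allocator(opt, blob_allocator);
    ncnn_option_set_workspace_allocator(opt, workspace_allocator);

    // ... ncnn_net_load_param / ncnn_net_load_model / extraction as usual ...
    // destroy the allocators only after the net itself is destroyed:
    // ncnn_allocator_destroy(blob_allocator);
    // ncnn_allocator_destroy(workspace_allocator);
}
```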
(1 << 5) +#endif + #if __riscv // from arch/riscv/include/uapi/asm/hwcap.h #define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A')) @@ -272,9 +292,60 @@ static cpu_subtype_t get_hw_cpusubtype() static unsigned int g_hw_cpufamily = get_hw_cpufamily(); static cpu_type_t g_hw_cputype = get_hw_cputype(); static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype(); + +static int get_hw_capability(const char* cap) +{ + int64_t value = 0; + size_t len = sizeof(value); + sysctlbyname(cap, &value, &len, NULL, 0); + return value; +} + +static int g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16"); +static int g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd"); +static int g_hw_optional_arm_FEAT_FHM = get_hw_capability("hw.optional.arm.FEAT_FHM"); +static int g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16"); +static int g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM"); #endif // __APPLE__ -#if defined __ANDROID__ || defined __linux__ +#if (defined _WIN32 && !(defined __MINGW32__)) +CpuSet::CpuSet() +{ + disable_all(); +} + +void CpuSet::enable(int cpu) +{ + mask |= (1 << cpu); +} + +void CpuSet::disable(int cpu) +{ + mask &= ~(1 << cpu); +} + +void CpuSet::disable_all() +{ + mask = 0; +} + +bool CpuSet::is_enabled(int cpu) const +{ + return mask & (1 << cpu); +} + +int CpuSet::num_enabled() const +{ + int num_enabled = 0; + for (int i = 0; i < (int)sizeof(mask) * 8; i++) + { + if (is_enabled(i)) + num_enabled++; + } + + return num_enabled; +} +#elif defined __ANDROID__ || defined __linux__ CpuSet::CpuSet() { disable_all(); @@ -444,7 +515,13 @@ int cpu_support_arm_asimdhp() #endif #elif __APPLE__ #if __aarch64__ - return g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD; + return g_hw_optional_arm_FEAT_FP16 + || g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL + || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -463,7 +540,11 @@ int cpu_support_arm_asimddp() #endif #elif __APPLE__ #if __aarch64__ - return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD; + return g_hw_optional_arm_FEAT_DotProd + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -482,7 +563,11 @@ int cpu_support_arm_asimdfhm() #endif #elif __APPLE__ #if __aarch64__ - return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD; + return g_hw_optional_arm_FEAT_FHM + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -501,7 +586,9 @@ int cpu_support_arm_bf16() #endif #elif __APPLE__ #if __aarch64__ 
- return 0; // no known apple cpu support armv8.6 bf16 + return g_hw_optional_arm_FEAT_BF16 + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -520,7 +607,9 @@ int cpu_support_arm_i8mm() #endif #elif __APPLE__ #if __aarch64__ - return 0; // no known apple cpu support armv8.6 i8mm + return g_hw_optional_arm_FEAT_I8MM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -1001,6 +1090,32 @@ int cpu_support_mips_msa() #endif } +int cpu_support_loongarch_lsx() +{ +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LSX; +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_loongarch_lasx() +{ +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LASX; +#else + return 0; +#endif +#else + return 0; +#endif +} + int cpu_support_loongson_mmi() { #if defined __ANDROID__ || defined __linux__ @@ -1069,6 +1184,10 @@ static int get_cpucount() count = emscripten_num_logical_cores(); else count = 1; +#elif (defined _WIN32 && !(defined __MINGW32__)) + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + count = system_info.dwNumberOfProcessors; #elif defined __ANDROID__ || defined __linux__ // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/cpuinfo", "rb"); @@ -1124,6 +1243,220 @@ int get_big_cpu_count() return big_cpu_count ? big_cpu_count : g_cpucount; } +#if defined __ANDROID__ || defined __linux__ +static int get_thread_siblings(int cpuid) +{ + char path[256]; + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid); + + FILE* fp = fopen(path, "rb"); + if (!fp) + return -1; + + int thread_siblings = -1; + int nscan = fscanf(fp, "%x", &thread_siblings); + if (nscan != 1) + { + // ignore + } + + fclose(fp); + + return thread_siblings; +} +#endif // defined __ANDROID__ || defined __linux__ + +static int get_physical_cpucount() +{ + int count = 0; +#if (defined _WIN32 && !(defined __MINGW32__)) + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi == NULL) + { + NCNN_LOGE("GetLogicalProcessorInformation is not supported"); + return g_cpucount; + } + + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationProcessorCore) + { + count++; + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); +#elif defined __ANDROID__ || defined __linux__ + std::vector thread_set; + for (int i = 0; i < g_cpucount; i++) + { + int thread_siblings = get_thread_siblings(i); + if (thread_siblings == -1) + { + // ignore malformed one + continue; + } + + bool thread_siblings_exists = false; + for (size_t j = 0; j < thread_set.size(); j++) + { + if (thread_set[j] == thread_siblings) + { + thread_siblings_exists = true; + break; + } + } + + if (!thread_siblings_exists) + { + thread_set.push_back(thread_siblings); + count++; + } + } +#elif __APPLE__ + size_t len = 
sizeof(count); + sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0); +#else + count = g_cpucount; +#endif + + if (count > g_cpucount) + count = g_cpucount; + + return count; +} + +static int g_physical_cpucount = get_physical_cpucount(); + +int get_physical_cpu_count() +{ + return g_physical_cpucount; +} + +int get_physical_little_cpu_count() +{ + if (g_physical_cpucount == g_cpucount) + return get_little_cpu_count(); + + return g_physical_cpucount * 2 - g_cpucount; +} + +int get_physical_big_cpu_count() +{ + if (g_physical_cpucount == g_cpucount) + return get_big_cpu_count(); + + return g_cpucount - g_physical_cpucount; +} + +#if (defined _WIN32 && !(defined __MINGW32__)) +static CpuSet get_smt_cpu_mask() +{ + CpuSet smt_cpu_mask; + + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi == NULL) + { + NCNN_LOGE("GetLogicalProcessorInformation is not supported"); + return smt_cpu_mask; + } + + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationProcessorCore) + { + CpuSet smt_set; + smt_set.mask = ptr->ProcessorMask; + if (smt_set.num_enabled() > 1) + { + // this core is smt + smt_cpu_mask.mask |= smt_set.mask; + } + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); + + return smt_cpu_mask; +} + +static std::vector get_max_freq_mhz() +{ + typedef struct _PROCESSOR_POWER_INFORMATION + { + ULONG Number; + ULONG MaxMhz; + ULONG CurrentMhz; + ULONG MhzLimit; + ULONG MaxIdleState; + ULONG CurrentIdleState; + } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION; + + HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll")); + + typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG); + LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation"); + if (cnpi == NULL) + { + NCNN_LOGE("CallNtPowerInformation is not supported"); + FreeLibrary(powrprof); + return std::vector(g_cpucount, 0); + } + + DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount; + PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length); + + cnpi(ProcessorInformation, NULL, 0, buffer, return_length); + + std::vector ret; + for (int i = 0; i < g_cpucount; i++) + { + ULONG max_mhz = buffer[i].MaxMhz; + ret.push_back(max_mhz); + } + + free(buffer); + FreeLibrary(powrprof); + return ret; +} + +static int set_sched_affinity(const CpuSet& thread_affinity_mask) +{ + DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask); + if (prev_mask == 0) + { + NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError()); + return -1; + } + + return 0; +} +#endif // (defined _WIN32 && !(defined __MINGW32__)) + #if defined __ANDROID__ || defined __linux__ static int get_max_freq_khz(int cpuid) { @@ -1199,6 +1532,39 @@ static int get_max_freq_khz(int cpuid) return max_freq_khz; } +static bool is_smt_cpu(int cpuid) +{ + // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72 + char path[256]; + sprintf(path, 
"/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid); + + FILE* fp = fopen(path, "rb"); + + if (!fp) + { + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid); + fp = fopen(path, "rb"); + + if (!fp) + return false; + } + + bool is_smt = false; + while (!feof(fp)) + { + char ch = fgetc(fp); + if (ch == ',' || ch == '-') + { + is_smt = true; + break; + } + } + + fclose(fp); + + return is_smt; +} + static int set_sched_affinity(const CpuSet& thread_affinity_mask) { // set affinity for thread @@ -1289,7 +1655,48 @@ static int setup_thread_affinity_masks() { g_thread_affinity_mask_all.disable_all(); -#if defined __ANDROID__ || defined __linux__ +#if (defined _WIN32 && !(defined __MINGW32__)) + // get max freq mhz for all cores + int max_freq_mhz_min = INT_MAX; + int max_freq_mhz_max = 0; + std::vector cpu_max_freq_mhz = get_max_freq_mhz(); + for (int i = 0; i < g_cpucount; i++) + { + int max_freq_mhz = cpu_max_freq_mhz[i]; + + // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz); + + if (max_freq_mhz > max_freq_mhz_max) + max_freq_mhz_max = max_freq_mhz; + if (max_freq_mhz < max_freq_mhz_min) + max_freq_mhz_min = max_freq_mhz; + } + + int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2; + if (max_freq_mhz_medium == max_freq_mhz_max) + { + g_thread_affinity_mask_little.disable_all(); + g_thread_affinity_mask_big = g_thread_affinity_mask_all; + return 0; + } + + CpuSet smt_cpu_mask = get_smt_cpu_mask(); + + for (int i = 0; i < g_cpucount; i++) + { + if (smt_cpu_mask.is_enabled(i)) + { + // always treat smt core as big core + g_thread_affinity_mask_big.enable(i); + continue; + } + + if (cpu_max_freq_mhz[i] < max_freq_mhz_medium) + g_thread_affinity_mask_little.enable(i); + else + g_thread_affinity_mask_big.enable(i); + } +#elif defined __ANDROID__ || defined __linux__ int max_freq_khz_min = INT_MAX; int max_freq_khz_max = 0; std::vector cpu_max_freq_khz(g_cpucount); @@ -1297,7 +1704,7 @@ static int setup_thread_affinity_masks() { int max_freq_khz = get_max_freq_khz(i); - // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz); + // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz); cpu_max_freq_khz[i] = max_freq_khz; @@ -1317,6 +1724,13 @@ static int setup_thread_affinity_masks() for (int i = 0; i < g_cpucount; i++) { + if (is_smt_cpu(i)) + { + // always treat smt core as big core + g_thread_affinity_mask_big.enable(i); + continue; + } + if (cpu_max_freq_khz[i] < max_freq_khz_medium) g_thread_affinity_mask_little.enable(i); else @@ -1324,6 +1738,7 @@ static int setup_thread_affinity_masks() } #elif __APPLE__ // affinity info from cpu model + // TODO find a general way to get per-core frequency on macos if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL) { // 2 + 4 @@ -1334,11 +1749,16 @@ static int setup_thread_affinity_masks() g_thread_affinity_mask_little.enable(4); g_thread_affinity_mask_little.enable(5); } - else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD) + else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH) { - // 2 + 4 or 4 + 4 - if (get_cpu_count() == 6) + int cpu_count = get_cpu_count(); + if (cpu_count == 6) { + // 2 + 4 
g_thread_affinity_mask_big.enable(0); g_thread_affinity_mask_big.enable(1); g_thread_affinity_mask_little.enable(2); @@ -1346,8 +1766,9 @@ static int setup_thread_affinity_masks() g_thread_affinity_mask_little.enable(4); g_thread_affinity_mask_little.enable(5); } - else + else if (cpu_count == 8) { + // 4 + 4 g_thread_affinity_mask_big.enable(0); g_thread_affinity_mask_big.enable(1); g_thread_affinity_mask_big.enable(2); @@ -1357,6 +1778,42 @@ static int setup_thread_affinity_masks() g_thread_affinity_mask_little.enable(6); g_thread_affinity_mask_little.enable(7); } + else if (cpu_count == 10) + { + // 8 + 2 + g_thread_affinity_mask_big.enable(0); + g_thread_affinity_mask_big.enable(1); + g_thread_affinity_mask_big.enable(2); + g_thread_affinity_mask_big.enable(3); + g_thread_affinity_mask_big.enable(4); + g_thread_affinity_mask_big.enable(5); + g_thread_affinity_mask_big.enable(6); + g_thread_affinity_mask_big.enable(7); + g_thread_affinity_mask_little.enable(8); + g_thread_affinity_mask_little.enable(9); + } + else if (cpu_count == 20) + { + // 16 + 4 + g_thread_affinity_mask_big.enable(0); + g_thread_affinity_mask_big.enable(1); + g_thread_affinity_mask_big.enable(2); + g_thread_affinity_mask_big.enable(3); + g_thread_affinity_mask_big.enable(4); + g_thread_affinity_mask_big.enable(5); + g_thread_affinity_mask_big.enable(6); + g_thread_affinity_mask_big.enable(7); + g_thread_affinity_mask_big.enable(8); + g_thread_affinity_mask_big.enable(9); + g_thread_affinity_mask_big.enable(10); + g_thread_affinity_mask_big.enable(11); + g_thread_affinity_mask_big.enable(12); + g_thread_affinity_mask_big.enable(13); + g_thread_affinity_mask_big.enable(14); + g_thread_affinity_mask_big.enable(15); + g_thread_affinity_mask_little.enable(16); + g_thread_affinity_mask_little.enable(17); + } } else { @@ -1394,7 +1851,7 @@ const CpuSet& get_cpu_thread_affinity_mask(int powersave) int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask) { -#if defined __ANDROID__ || defined __linux__ +#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__)) int num_threads = thread_affinity_mask.num_enabled(); #ifdef _OPENMP @@ -1584,3 +2041,21 @@ int set_flush_denormals(int flush_denormals) } } // namespace ncnn + +#if defined __ANDROID__ && defined(_OPENMP) && __clang__ +#ifdef __cplusplus +extern "C" { +#endif +void __wrap___kmp_affinity_determine_capable(const char* /*env_var*/) +{ + // the internal affinity routines in llvm openmp call abort on __NR_sched_getaffinity / __NR_sched_setaffinity fails + // ref KMPNativeAffinity::get_system_affinity/set_system_affinity in openmp/runtime/src/kmp_affinity.h + // and cpu core goes offline in powersave mode on android, which triggers abort + // ATM there is no known api for controlling the abort behavior + // override __kmp_affinity_determine_capable with empty body to disable affinity regardless of KMP_AFFINITY env_var + // ugly hack works >.< --- nihui +} +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/src/cpu.h b/src/cpu.h index 5a94106ef478..0f748f33d97a 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -17,6 +17,10 @@ #include +#if (defined _WIN32 && !(defined __MINGW32__)) +#define WIN32_LEAN_AND_MEAN +#include +#endif #if defined __ANDROID__ || defined __linux__ #include // cpu_set_t #endif @@ -36,6 +40,9 @@ class NCNN_EXPORT CpuSet int num_enabled() const; public: +#if (defined _WIN32 && !(defined __MINGW32__)) + ULONG_PTR mask; +#endif #if defined __ANDROID__ || defined __linux__ cpu_set_t cpu_set; #endif @@ -93,6 
+100,11 @@ NCNN_EXPORT int cpu_support_x86_avx512_bf16(); // avx512_fp16 = x86 avx512 fp16 NCNN_EXPORT int cpu_support_x86_avx512_fp16(); +// lsx = loongarch lsx +NCNN_EXPORT int cpu_support_loongarch_lsx(); +// lasx = loongarch lasx +NCNN_EXPORT int cpu_support_loongarch_lasx(); + // msa = mips mas NCNN_EXPORT int cpu_support_mips_msa(); // mmi = loongson mmi @@ -110,6 +122,10 @@ NCNN_EXPORT int get_cpu_count(); NCNN_EXPORT int get_little_cpu_count(); NCNN_EXPORT int get_big_cpu_count(); +NCNN_EXPORT int get_physical_cpu_count(); +NCNN_EXPORT int get_physical_little_cpu_count(); +NCNN_EXPORT int get_physical_big_cpu_count(); + // bind all threads on little clusters if powersave enabled // affects HMP arch cpu like ARM big.LITTLE // only implemented on android at the moment diff --git a/src/layer.cpp b/src/layer.cpp index 518b666ec23f..953aebcd2bd7 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -253,6 +253,13 @@ Layer* create_layer(int index) } else #endif // NCNN_RUNTIME_CPU && NCNN_AVX +#if NCNN_RUNTIME_CPU && NCNN_LSX + if (ncnn::cpu_support_loongarch_lsx()) + { + layer_creator = layer_registry_lsx[index].creator; + } + else +#endif // NCNN_RUNTIME_CPU && NCNN_LSX #if NCNN_RUNTIME_CPU && NCNN_MSA if (ncnn::cpu_support_mips_msa()) { diff --git a/src/layer.h b/src/layer.h index 46fed5e456cd..d02f65bbca9b 100644 --- a/src/layer.h +++ b/src/layer.h @@ -96,10 +96,9 @@ class NCNN_EXPORT Layer bool support_reserved_7; bool support_reserved_8; bool support_reserved_9; - bool support_reserved_10; - bool support_reserved_11; - bool support_reserved_12; - bool support_reserved_13; + + // feature disabled set + int featmask; public: // implement inference diff --git a/src/layer/arm/cast_bf16.h b/src/layer/arm/cast_bf16.h index 468e5eca3efd..aa8223d73f57 100644 --- a/src/layer/arm/cast_bf16.h +++ b/src/layer/arm/cast_bf16.h @@ -150,7 +150,7 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) { -#if NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC +#if NCNN_RUNTIME_CPU && NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC if (ncnn::cpu_support_arm_bf16()) { cast_bf16_to_fp32_neon_bf16(bottom_blob, top_blob, opt); diff --git a/src/layer/arm/cast_fp16.h b/src/layer/arm/cast_fp16.h index bb326a970720..7e6db748aec2 100644 --- a/src/layer/arm/cast_fp16.h +++ b/src/layer/arm/cast_fp16.h @@ -47,12 +47,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const { #if __aarch64__ asm volatile( - "prfm pldl1keep, [%0, #512] \n" + "prfm pldl1keep, [%0, #512] \n" "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n" - "fcvtn v0.4h, v0.4s \n" - "fcvtn v1.4h, v1.4s \n" - "fcvtn v2.4h, v2.4s \n" - "fcvtn v3.4h, v3.4s \n" + "fcvtn v0.4h, v0.4s \n" + "fcvtn v1.4h, v1.4s \n" + "fcvtn v2.4h, v2.4s \n" + "fcvtn v3.4h, v3.4s \n" "st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n" : "=r"(ptr), // %0 "=r"(outptr) // %1 @@ -61,12 +61,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const : "memory", "v0", "v1", "v2", "v3"); #else // __aarch64__ asm volatile( - "pld [%0, #512] \n" - "vldm %0!, {d0-d7} \n" - "vcvt.f16.f32 d0, q0 \n" - "vcvt.f16.f32 d1, q1 \n" - "vcvt.f16.f32 d2, q2 \n" - "vcvt.f16.f32 d3, q3 \n" + "pld [%0, #512] \n" + "vldm %0!, {d0-d7} \n" + "vcvt.f16.f32 d0, q0 \n" + "vcvt.f16.f32 d1, q1 \n" + "vcvt.f16.f32 d2, q2 \n" + "vcvt.f16.f32 d3, q3 \n" "vst1.u16 {d0-d3}, [%1 :128]! 
\n" : "=r"(ptr), // %0 "=r"(outptr) // %1 @@ -77,24 +77,61 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const } for (; i + 7 < size; i += 8) { - float32x4_t _p0_fp32 = vld1q_f32(ptr); - float32x4_t _p1_fp32 = vld1q_f32(ptr + 4); - float16x4_t _p0_fp16 = vcvt_f16_f32(_p0_fp32); - float16x4_t _p1_fp16 = vcvt_f16_f32(_p1_fp32); - uint16x8_t _p_fp16 = vcombine_u16(vreinterpret_u16_f16(_p0_fp16), vreinterpret_u16_f16(_p1_fp16)); - vst1q_u16(outptr, _p_fp16); - ptr += 8; - outptr += 8; + // This is originally implemented with neon fp16 intrinsics. + // In the new version of gcc, __ARM_FP16_FORMAT_IEEE or __ARM_FP16_FORMAT_ALTERNATIVE needs to be defined to use the float16x4_t type. + // That leads to compiler error when compiled with -mfpu=neon-vfpv4 but without -mfp16-format=ieee flag. + // We could add more macro conditions to differentiate between old and new versions, but that's pretty ugly! + // Just use all inline assembly here ~ + // --- nihui +#if __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%0], #32 \n" + "fcvtn v0.4h, v0.4s \n" + "fcvtn v1.4h, v1.4s \n" + "st1 {v0.4h, v1.4h}, [%1], #16 \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "v0", "v1"); +#else // __aarch64__ + asm volatile( + "vld1.f32 {d0-d3}, [%0]! \n" + "vcvt.f16.f32 d0, q0 \n" + "vcvt.f16.f32 d1, q1 \n" + "vst1.u16 {d0-d1}, [%1]! \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "q0", "q1"); +#endif // __aarch64__ } for (; i + 3 < size; i += 4) { - float32x4_t _p_fp32 = vld1q_f32(ptr); - float16x4_t _p_fp16 = vcvt_f16_f32(_p_fp32); - vst1_u16(outptr, vreinterpret_u16_f16(_p_fp16)); - ptr += 4; - outptr += 4; +#if __aarch64__ + asm volatile( + "ld1 {v0.4s}, [%0], #16 \n" + "fcvtn v0.4h, v0.4s \n" + "st1 {v0.4h}, [%1], #8 \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "v0"); +#else // __aarch64__ + asm volatile( + "vld1.f32 {d0-d1}, [%0]! \n" + "vcvt.f16.f32 d0, q0 \n" + "vst1.u16 {d0}, [%1]! \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "q0"); +#endif // __aarch64__ } -#endif +#endif // (__ARM_FP & 2) for (; i < size; i++) { *outptr++ = float32_to_float16(*ptr++); @@ -104,7 +141,7 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) { -#if NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2) +#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2) if (ncnn::cpu_support_arm_vfpv4()) { cast_fp16_to_fp32_neon_vfpv4(bottom_blob, top_blob, opt); @@ -132,12 +169,12 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const { #if __aarch64__ asm volatile( - "prfm pldl1keep, [%0, #256] \n" + "prfm pldl1keep, [%0, #256] \n" "ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n" - "fcvtl v0.4s, v0.4h \n" - "fcvtl v1.4s, v1.4h \n" - "fcvtl v2.4s, v2.4h \n" - "fcvtl v3.4s, v3.4h \n" + "fcvtl v0.4s, v0.4h \n" + "fcvtl v1.4s, v1.4h \n" + "fcvtl v2.4s, v2.4h \n" + "fcvtl v3.4s, v3.4h \n" "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" : "=r"(ptr), // %0 "=r"(outptr) // %1 @@ -146,13 +183,13 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const : "memory", "v0", "v1", "v2", "v3"); #else // __aarch64__ asm volatile( - "pld [%0, #256] \n" + "pld [%0, #256] \n" "vld1.u16 {d4-d7}, [%0 :128]! 
\n" - "vcvt.f32.f16 q0, d4 \n" - "vcvt.f32.f16 q1, d5 \n" - "vcvt.f32.f16 q2, d6 \n" - "vcvt.f32.f16 q3, d7 \n" - "vstm %1!, {d0-d7} \n" + "vcvt.f32.f16 q0, d4 \n" + "vcvt.f32.f16 q1, d5 \n" + "vcvt.f32.f16 q2, d6 \n" + "vcvt.f32.f16 q3, d7 \n" + "vstm %1!, {d0-d7} \n" : "=r"(ptr), // %0 "=r"(outptr) // %1 : "0"(ptr), @@ -162,25 +199,55 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const } for (; i + 7 < size; i += 8) { - uint16x8_t _p_fp16 = vld1q_u16(ptr); - float16x4_t _p0_fp16 = vreinterpret_f16_u16(vget_low_u16(_p_fp16)); - float16x4_t _p1_fp16 = vreinterpret_f16_u16(vget_high_u16(_p_fp16)); - float32x4_t _p0_fp32 = vcvt_f32_f16(_p0_fp16); - float32x4_t _p1_fp32 = vcvt_f32_f16(_p1_fp16); - vst1q_f32(outptr, _p0_fp32); - vst1q_f32(outptr + 4, _p1_fp32); - ptr += 8; - outptr += 8; +#if __aarch64__ + asm volatile( + "ld1 {v0.4h, v1.4h}, [%0], #16 \n" + "fcvtl v0.4s, v0.4h \n" + "fcvtl v1.4s, v1.4h \n" + "st1 {v0.4s, v1.4s}, [%1], #32 \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "v0", "v1"); +#else // __aarch64__ + asm volatile( + "vld1.u16 {d4-d5}, [%0]! \n" + "vcvt.f32.f16 q0, d4 \n" + "vcvt.f32.f16 q1, d5 \n" + "vst1.f32 {d0-d3}, [%1]! \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "q0", "q1", "q2"); +#endif // __aarch64__ } for (; i + 3 < size; i += 4) { - float16x4_t _p_fp16 = vreinterpret_f16_u16(vld1_u16(ptr)); - float32x4_t _p_fp32 = vcvt_f32_f16(_p_fp16); - vst1q_f32(outptr, _p_fp32); - ptr += 4; - outptr += 4; +#if __aarch64__ + asm volatile( + "ld1 {v0.4h}, [%0], #8 \n" + "fcvtl v0.4s, v0.4h \n" + "st1 {v0.4s}, [%1], #16 \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "v0"); +#else // __aarch64__ + asm volatile( + "vld1.u16 {d2}, [%0]! \n" + "vcvt.f32.f16 q0, d2 \n" + "vst1.f32 {d0-d1}, [%1]! 
\n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "q0", "q1"); +#endif // __aarch64__ } -#endif +#endif // (__ARM_FP & 2) for (; i < size; i++) { *outptr++ = float16_to_float32(*ptr++); diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index 1f8f0c1cb485..f31ed1576ca9 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -387,10 +387,8 @@ int Convolution_arm::create_pipeline(const Option& opt) // conv3x3s1_winograd63_transform_kernel_neon(weight_data, weight_winograd63_data, num_input, num_output, opt); conv3x3s1_winograd63_transform_kernel_neon5(weight_data, weight_winograd63_data, num_input, num_output, opt); } - else - { - weight_data_tm = weight_data; - } + + weight_data_tm = weight_data; } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { diff --git a/src/layer/arm/innerproduct_fp16s.h b/src/layer/arm/innerproduct_fp16s.h index 18214bc91fec..31edd9ed64a5 100644 --- a/src/layer/arm/innerproduct_fp16s.h +++ b/src/layer/arm/innerproduct_fp16s.h @@ -253,10 +253,10 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _val = vld1q_f32(sptr); uint16x8_t _w01 = vld1q_u16(kptr); uint16x8_t _w23 = vld1q_u16(kptr + 8); - float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01))); - float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01))); - float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23))); - float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23))); + float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01))); + float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01))); + float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23))); + float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23))); #endif #if __aarch64__ @@ -281,7 +281,7 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif _sum0 = vfmaq_f32(_sum0, _val, _w); @@ -410,10 +410,10 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr3)); #else float32x4_t _val = vld1q_f32(sptr); - float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr0))); - float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr1))); - float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr2))); - float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr3))); + float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr0))); + float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr1))); + float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr2))); + float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr3))); #endif _sum0 = vfmaq_f32(_sum0, _val, _w0); @@ -507,7 +507,7 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else float32x4_t _val = vld1q_f32(sptr); - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif _sum = vfmaq_f32(_sum, _val, _w); @@ -713,10 +713,10 @@ static void 
innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat { // transpose 4x4 uint16x4x4_t _p; - _p.val[0] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k0))); - _p.val[1] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k1))); - _p.val[2] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k2))); - _p.val[3] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k3))); + _p.val[0] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k0))); + _p.val[1] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k1))); + _p.val[2] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k2))); + _p.val[3] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k3))); vst4_u16(g0, _p); k0 += 4; diff --git a/src/layer/arm/innerproduct_gemm_fp16s.h b/src/layer/arm/innerproduct_gemm_fp16s.h index 8e6731dc847c..f7daa17da39b 100644 --- a/src/layer/arm/innerproduct_gemm_fp16s.h +++ b/src/layer/arm/innerproduct_gemm_fp16s.h @@ -120,7 +120,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else float32x4_t _val = vld1q_f32(m); - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif #if __aarch64__ @@ -214,10 +214,10 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _val = vld1q_f32(m); uint16x8_t _w01 = vld1q_u16(kptr); uint16x8_t _w23 = vld1q_u16(kptr + 8); - float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01))); - float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01))); - float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23))); - float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23))); + float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01))); + float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01))); + float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23))); + float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23))); #endif #if __aarch64__ @@ -242,7 +242,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif _sum0 = vfmaq_f32(_sum0, _val, _w); @@ -317,7 +317,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _val1 = vld1q_f32(m + 4); float32x4_t _val2 = vld1q_f32(m + 8); float32x4_t _val3 = vld1q_f32(m + 12); - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif #if __aarch64__ @@ -414,8 +414,8 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _val0 = vld1q_f32(m); float32x4_t _val1 = vld1q_f32(m + 4); uint16x8_t _w01 = vld1q_u16(kptr); - float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01))); - float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01))); + float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01))); + float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01))); #endif _sum0 = vfmaq_f32(_sum0, _val0, _w0); @@ -433,7 +433,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else float32x4_t _val = vld1q_f32(m); - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + 
float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif _sum0 = vfmaq_f32(_sum0, _val, _w); diff --git a/src/layer/arm/lstm_arm.cpp b/src/layer/arm/lstm_arm.cpp index 440c7bc8ce8e..075da57aff80 100644 --- a/src/layer/arm/lstm_arm.cpp +++ b/src/layer/arm/lstm_arm.cpp @@ -58,11 +58,11 @@ int LSTM_arm::create_pipeline(const Option& opt) // pack IFOG int num_directions = direction == 2 ? 2 : 1; - int size = weight_data_size / num_directions / num_output / 4; + int size = weight_data_size / num_directions / hidden_size / 4; - weight_xc_data_packed.create(size, num_output, num_directions, 16u, 4); - bias_c_data_packed.create(num_output, 1, num_directions, 16u, 4); - weight_hc_data_packed.create(num_output, num_output, num_directions, 16u, 4); + weight_xc_data_packed.create(size, hidden_size, num_directions, 16u, 4); + bias_c_data_packed.create(hidden_size, 1, num_directions, 16u, 4); + weight_hc_data_packed.create(num_output, hidden_size, num_directions, 16u, 4); #pragma omp parallel for num_threads(opt.num_threads) for (int dr = 0; dr < num_directions; dr++) @@ -82,7 +82,7 @@ int LSTM_arm::create_pipeline(const Option& opt) float* bias_c_IFOG = bias_c_data_packed_dr.row(0); - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { bias_c_IFOG[0] = bias_c_I[q]; bias_c_IFOG[1] = bias_c_F[q]; @@ -91,15 +91,15 @@ int LSTM_arm::create_pipeline(const Option& opt) bias_c_IFOG += 4; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); float* weight_xc_IFOG = weight_xc_data_packed_dr.row(q); float* weight_hc_IFOG = weight_hc_data_packed_dr.row(q); @@ -126,21 +126,37 @@ int LSTM_arm::create_pipeline(const Option& opt) } } + if (opt.lightmode) + { + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + } + return 0; } -static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) +static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(4, num_output, 4u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 4u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + 
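// (sketch of the data flow, not part of the patch) num_output != hidden_size means
// the LSTM carries a projection (proj_size): each step first writes the
// hidden_size-wide gate output H into this scratch buffer and then reduces it to
// num_output values with weight_hr,
//
//     hidden[q] = output[q] = sum over i < hidden_size of tmp_hidden[i] * weight_hr[q][i]
//
// which is what the trailing num_output loop computes; without a projection the
// buffer is never allocated and H is stored into hidden_state/output directly.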
tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -155,7 +171,7 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w const float* x = bottom_blob.row(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const float* bias_c_IFOG = (const float*)bias_c + q * 4; @@ -291,14 +307,15 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; + float* tmp_hidden_ptr = tmp_hidden_state; - int remain_num_output_start = 0; + int remain_hidden_size_start = 0; #if __ARM_NEON - int nn_num_output = num_output >> 2; - remain_num_output_start = nn_num_output << 2; + int nn_hidden_size = hidden_size >> 2; + remain_hidden_size_start = nn_hidden_size << 2; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 4; @@ -315,12 +332,20 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); - vst1q_f32(hidden_ptr + q, _H); - vst1q_f32(output_data + q, _H); + + if (num_output == hidden_size) + { + vst1q_f32(hidden_ptr + q, _H); + vst1q_f32(output_data + q, _H); + } + else + { + vst1q_f32(tmp_hidden_ptr + q, _H); + } } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const float* gates_data = gates.row(q); @@ -338,8 +363,43 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float H = O * tanh(cell2); cell_ptr[q] = cell2; - hidden_ptr[q] = H; - output_data[q] = H; + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = H; + } + else + { + tmp_hidden_ptr[q] = H; + } + } + + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + hidden_ptr[q] = H; + output_data[q] = H; + } } } @@ -375,7 +435,7 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) return -100; hidden.fill(0.f); - Mat cell(num_output, 4u, opt.workspace_allocator); + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -387,7 +447,7 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), 
num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -402,14 +462,14 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (top_blob_reverse.empty()) return -100; - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.0f); cell.fill(0.0f); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, cell, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -466,7 +526,7 @@ int LSTM_arm::forward(const std::vector& bottom_blobs, std::vector& to return -100; hidden.fill(0.f); - cell.create(num_output, num_directions, 4u, hidden_cell_allocator); + cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -480,7 +540,7 @@ int LSTM_arm::forward(const std::vector& bottom_blobs, std::vector& to // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -497,13 +557,13 @@ int LSTM_arm::forward(const std::vector& bottom_blobs, std::vector& to Mat hidden0 = hidden.row_range(0, 1); Mat cell0 = cell.row_range(0, 1); - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt); if (ret0 != 0) return ret0; Mat hidden1 = hidden.row_range(1, 1); Mat cell1 = cell.row_range(1, 1); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? 
Mat() : weight_hr_data.channel(1), hidden1, cell1, opt); if (ret1 != 0) return ret1; @@ -529,18 +589,27 @@ int LSTM_arm::forward(const std::vector& bottom_blobs, std::vector& to } #if NCNN_BF16 -static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) +static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(4, num_output, 4u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 4u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -555,7 +624,7 @@ static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const const unsigned short* x = bottom_blob.row(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const unsigned short* bias_c_IFOG = (const unsigned short*)bias_c + q * 4; @@ -693,14 +762,15 @@ static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; + float* tmp_hidden_ptr = tmp_hidden_state; - int remain_num_output_start = 0; + int remain_hidden_size_start = 0; #if __ARM_NEON - int nn_num_output = num_output >> 2; - remain_num_output_start = nn_num_output << 2; + int nn_hidden_size = hidden_size >> 2; + remain_hidden_size_start = nn_hidden_size << 2; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 4; @@ -717,12 +787,20 @@ static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); - vst1q_f32(hidden_ptr + q, _H); - vst1_u16(output_data + q, bfloat2float(_H)); + + if (num_output == hidden_size) + { + vst1q_f32(hidden_ptr + q, _H); + vst1_u16(output_data + q, bfloat2float(_H)); + } + else + { + vst1q_f32(tmp_hidden_ptr + q, _H); + } } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const float* gates_data = gates.row(q); @@ -740,8 +818,43 @@ static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float H = O * tanh(cell2); cell_ptr[q] = cell2; - hidden_ptr[q] = H; - output_data[q] = float32_to_bfloat16(H); + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = float32_to_bfloat16(H); + } + else + { + tmp_hidden_ptr[q] = H; + } + } + + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 
remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + hidden_ptr[q] = H; + output_data[q] = float32_to_bfloat16(H); + } } } @@ -752,11 +865,11 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt) { // pack IFOG int num_directions = direction == 2 ? 2 : 1; - int size = weight_data_size / num_directions / num_output / 4; + int size = weight_data_size / num_directions / hidden_size / 4; - weight_xc_data_packed.create(size, num_output, num_directions, 8u, 4); - bias_c_data_packed.create(num_output, 1, num_directions, 8u, 4); - weight_hc_data_packed.create(num_output, num_output, num_directions, 8u, 4); + weight_xc_data_packed.create(size, hidden_size, num_directions, 8u, 4); + bias_c_data_packed.create(hidden_size, 1, num_directions, 8u, 4); + weight_hc_data_packed.create(num_output, hidden_size, num_directions, 8u, 4); #pragma omp parallel for num_threads(opt.num_threads) for (int dr = 0; dr < num_directions; dr++) @@ -776,7 +889,7 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt) unsigned short* bias_c_IFOG = bias_c_data_packed_dr.row(0); - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { bias_c_IFOG[0] = float32_to_bfloat16(bias_c_I[q]); bias_c_IFOG[1] = float32_to_bfloat16(bias_c_F[q]); @@ -785,15 +898,15 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt) bias_c_IFOG += 4; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); unsigned short* weight_xc_IFOG = weight_xc_data_packed_dr.row(q); unsigned short* weight_hc_IFOG = weight_hc_data_packed_dr.row(q); @@ -820,6 +933,13 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt) } } + if (opt.lightmode) + { + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + } + return 0; } @@ -835,7 +955,7 @@ int LSTM_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& return -100; hidden.fill(0.f); - Mat cell(num_output, 4u, opt.workspace_allocator); + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -847,7 +967,7 @@ int LSTM_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret = 
lstm_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -862,14 +982,14 @@ int LSTM_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& if (top_blob_reverse.empty()) return -100; - int ret0 = lstm_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret0 = lstm_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.f); cell.fill(0.f); - int ret1 = lstm_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, cell, opt); + int ret1 = lstm_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -911,7 +1031,7 @@ int LSTM_arm::forward_bf16s(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const __fp16* bias_c_IFOG = (const __fp16*)bias_c + q * 4; @@ -141,11 +150,12 @@ static int lstm_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; + float* tmp_hidden_ptr = tmp_hidden_state; - int nn_num_output = num_output >> 2; - int remain_num_output_start = nn_num_output << 2; + int nn_hidden_size = hidden_size >> 2; + int remain_hidden_size_start = nn_hidden_size << 2; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 4; @@ -162,11 +172,19 @@ static int lstm_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); - vst1q_f32(hidden_ptr + q, _H); - vst1_f16(output_data + q, vcvt_f16_f32(_H)); + + if (num_output == hidden_size) + { + vst1q_f32(hidden_ptr + q, _H); + vst1_f16(output_data + q, vcvt_f16_f32(_H)); + } + else + { + vst1q_f32(tmp_hidden_ptr + q, _H); + } } #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const float* gates_data = gates.row(q); @@ -184,26 +202,70 @@ static int lstm_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float H = O * tanh(cell2); cell_ptr[q] = cell2; - hidden_ptr[q] = H; - output_data[q] = (__fp16)(H); + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = (__fp16)H; + } + else + { + tmp_hidden_ptr[q] = H; + } + } + + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + 
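// (illustrative only, not part of this change) the commented-out block above marks
// where a packed variant of the projection could go; one option on __aarch64__ is
// to vectorize the inner dot product against weight_hr four floats at a time:
//
//     float32x4_t _sum = vdupq_n_f32(0.f);
//     int i = 0;
//     for (; i + 3 < hidden_size; i += 4)
//         _sum = vfmaq_f32(_sum, vld1q_f32(tmp_hidden_ptr + i), vld1q_f32(hr + i));
//     float H = vaddvq_f32(_sum);
//     for (; i < hidden_size; i++)
//         H += tmp_hidden_ptr[i] * hr[i];
//
// the scalar loop below stays as the reference implementation.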
#pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + hidden_ptr[q] = H; + output_data[q] = (__fp16)H; + } } } return 0; } -static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) +static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(4, num_output, 2u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 2u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -216,10 +278,10 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const int ti = reverse ? T - 1 - t : t; - int nn_num_output = num_output >> 1; - int remain_num_output_start = nn_num_output << 1; + int nn_hidden_size = hidden_size >> 1; + int remain_hidden_size_start = nn_hidden_size << 1; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 2; @@ -319,7 +381,7 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const vst1q_f16(gates_data, _IFOG); } #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const __fp16* bias_c_IFOG = (const __fp16*)bias_c + q * 4; @@ -428,11 +490,12 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; + float* tmp_hidden_ptr = tmp_hidden_state; - nn_num_output = num_output >> 2; - remain_num_output_start = nn_num_output << 2; + nn_hidden_size = hidden_size >> 2; + remain_hidden_size_start = nn_hidden_size << 2; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 4; @@ -449,11 +512,19 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); - vst1q_f32(hidden_ptr + q, _H); - vst1_f16(output_data + q, vcvt_f16_f32(_H)); + + if (num_output == hidden_size) + { + vst1q_f32(hidden_ptr + q, _H); + vst1_f16(output_data + q, vcvt_f16_f32(_H)); + } + else + { + vst1q_f32(tmp_hidden_ptr + q, _H); + } } #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const __fp16* gates_data = gates.row(q); @@ -471,8 +542,43 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float H = O * tanh(cell2); cell_ptr[q] = cell2; - hidden_ptr[q] = H; - 
output_data[q] = (__fp16)H; + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = (__fp16)H; + } + else + { + tmp_hidden_ptr[q] = H; + } + } + + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + hidden_ptr[q] = H; + output_data[q] = (__fp16)H; + } } } @@ -483,19 +589,19 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) { // pack IFOG int num_directions = direction == 2 ? 2 : 1; - int size = weight_data_size / num_directions / num_output / 4; + int size = weight_data_size / num_directions / hidden_size / 4; if (opt.use_fp16_arithmetic) { - weight_xc_data_packed.create(size, num_output / 2 + num_output % 2, num_directions, 16u, 8); - bias_c_data_packed.create(num_output, 1, num_directions, 8u, 4); - weight_hc_data_packed.create(num_output, num_output / 2 + num_output % 2, num_directions, 16u, 8); + weight_xc_data_packed.create(size, hidden_size / 2 + hidden_size % 2, num_directions, 16u, 8); + bias_c_data_packed.create(hidden_size, 1, num_directions, 8u, 4); + weight_hc_data_packed.create(num_output, hidden_size / 2 + hidden_size % 2, num_directions, 16u, 8); } else { - weight_xc_data_packed.create(size, num_output, num_directions, 8u, 4); - bias_c_data_packed.create(num_output, 1, num_directions, 8u, 4); - weight_hc_data_packed.create(num_output, num_output, num_directions, 8u, 4); + weight_xc_data_packed.create(size, hidden_size, num_directions, 8u, 4); + bias_c_data_packed.create(hidden_size, 1, num_directions, 8u, 4); + weight_hc_data_packed.create(num_output, hidden_size, num_directions, 8u, 4); } #pragma omp parallel for num_threads(opt.num_threads) @@ -519,7 +625,7 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) if (opt.use_fp16_arithmetic) { int q = 0; - for (; q + 1 < num_output; q += 2) + for (; q + 1 < hidden_size; q += 2) { bias_c_IFOG[0] = (__fp16)bias_c_I[q]; bias_c_IFOG[1] = (__fp16)bias_c_F[q]; @@ -532,23 +638,23 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) bias_c_IFOG += 8; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); - const float* weight_xc_I_1 = weight_xc.row(num_output * 0 + q + 1); - const float* weight_xc_F_1 = weight_xc.row(num_output * 1 + q + 1); - const float* weight_xc_O_1 = weight_xc.row(num_output * 2 + q + 1); - const float* weight_xc_G_1 = weight_xc.row(num_output * 3 + q + 1); - - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); - const float* weight_hc_I_1 = weight_hc.row(num_output * 0 + q + 1); - const float* weight_hc_F_1 = weight_hc.row(num_output * 1 + q + 1); - const float* weight_hc_O_1 = weight_hc.row(num_output * 2 + q + 
1); - const float* weight_hc_G_1 = weight_hc.row(num_output * 3 + q + 1); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); + const float* weight_xc_I_1 = weight_xc.row(hidden_size * 0 + q + 1); + const float* weight_xc_F_1 = weight_xc.row(hidden_size * 1 + q + 1); + const float* weight_xc_O_1 = weight_xc.row(hidden_size * 2 + q + 1); + const float* weight_xc_G_1 = weight_xc.row(hidden_size * 3 + q + 1); + + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); + const float* weight_hc_I_1 = weight_hc.row(hidden_size * 0 + q + 1); + const float* weight_hc_F_1 = weight_hc.row(hidden_size * 1 + q + 1); + const float* weight_hc_O_1 = weight_hc.row(hidden_size * 2 + q + 1); + const float* weight_hc_G_1 = weight_hc.row(hidden_size * 3 + q + 1); __fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(q / 2); __fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(q / 2); @@ -581,7 +687,7 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) weight_hc_IFOG += 8; } } - for (; q < num_output; q++) + for (; q < hidden_size; q++) { bias_c_IFOG[0] = (__fp16)bias_c_I[q]; bias_c_IFOG[1] = (__fp16)bias_c_F[q]; @@ -590,15 +696,15 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) bias_c_IFOG += 4; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); __fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(q / 2 + q % 2); __fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(q / 2 + q % 2); @@ -626,7 +732,7 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) } else { - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { bias_c_IFOG[0] = (__fp16)bias_c_I[q]; bias_c_IFOG[1] = (__fp16)bias_c_F[q]; @@ -635,15 +741,15 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) bias_c_IFOG += 4; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + 
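// One possible standalone sketch of the fp16 bias packing performed above: with fp16
// arithmetic enabled, the I/F/O/G biases of rows q and q+1 are interleaved so a single
// 8-lane half-precision load feeds both rows, and the scalar tail keeps the plain 4-wide
// layout. The exact element order for the second row is assumed here, and the code
// assumes an ARM toolchain that provides __fp16.
static void pack_bias_ifog_fp16(const float* bI, const float* bF,
                                const float* bO, const float* bG,
                                int hidden_size, __fp16* out)
{
    int q = 0;
    for (; q + 1 < hidden_size; q += 2)
    {
        out[0] = (__fp16)bI[q];     out[1] = (__fp16)bF[q];
        out[2] = (__fp16)bO[q];     out[3] = (__fp16)bG[q];
        out[4] = (__fp16)bI[q + 1]; out[5] = (__fp16)bF[q + 1];
        out[6] = (__fp16)bO[q + 1]; out[7] = (__fp16)bG[q + 1];
        out += 8;
    }
    for (; q < hidden_size; q++)
    {
        out[0] = (__fp16)bI[q]; out[1] = (__fp16)bF[q];
        out[2] = (__fp16)bO[q]; out[3] = (__fp16)bG[q];
        out += 4;
    }
}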
const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); __fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(q); __fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(q); @@ -671,6 +777,13 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) } } + if (opt.lightmode) + { + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + } + return 0; } @@ -686,7 +799,7 @@ int LSTM_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& return -100; hidden.fill(0.f); - Mat cell(num_output, 4u, opt.workspace_allocator); + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -698,7 +811,7 @@ int LSTM_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret = lstm_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -713,14 +826,14 @@ int LSTM_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& if (top_blob_reverse.empty()) return -100; - int ret0 = lstm_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret0 = lstm_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.f); cell.fill(0.f); - int ret1 = lstm_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, cell, opt); + int ret1 = lstm_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -762,7 +875,7 @@ int LSTM_arm::forward_fp16s(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::v { const Mat& q_blob = bottom_blobs[0]; const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; - const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[2]; + const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? 
k_blob : bottom_blobs[2]; - size_t elemsize = q_blob.elemsize; - int elempack = q_blob.elempack; + size_t src_elemsize = q_blob.elemsize; + int src_elempack = q_blob.elempack; + size_t dst_elemsize = k_blob.elemsize; + int dst_elempack = k_blob.elempack; - const int seqlen = q_blob.h; + const int src_seqlen = q_blob.h; + const int dst_seqlen = k_blob.h; const int embed_dim_per_head = embed_dim / num_head; const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); #if __ARM_NEON - if (elempack == 4) + if (src_elempack == 4) { Mat& top_blob = top_blobs[0]; - top_blob.create(embed_dim, seqlen, elemsize, elempack, opt.blob_allocator); + top_blob.create(embed_dim, src_seqlen, src_elemsize, src_elempack, opt.blob_allocator); if (top_blob.empty()) return -1; - Mat xq(embed_dim_per_head, seqlen, num_head, elemsize, elempack, opt.workspace_allocator); - Mat xk(embed_dim_per_head, seqlen, num_head, elemsize, elempack, opt.workspace_allocator); - Mat xv(seqlen, embed_dim_per_head, num_head, elemsize, elempack, opt.workspace_allocator); + Mat xq(embed_dim_per_head, src_seqlen, num_head, src_elemsize, src_elempack, opt.workspace_allocator); + Mat xk(embed_dim_per_head, dst_seqlen, num_head, dst_elemsize, dst_elempack, opt.workspace_allocator); + Mat xv(dst_seqlen, embed_dim_per_head, num_head, dst_elemsize, dst_elempack, opt.workspace_allocator); - Mat xqk(seqlen * elempack, seqlen, num_head, elemsize, elempack, opt.workspace_allocator); + Mat xqk(dst_seqlen * dst_elempack, src_seqlen, num_head, src_elemsize, src_elempack, opt.workspace_allocator); - Mat xqkv(embed_dim_per_head, num_head, seqlen, elemsize, elempack, opt.workspace_allocator); + Mat xqkv(embed_dim_per_head, num_head, src_seqlen, src_elemsize, src_elempack, opt.workspace_allocator); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < num_head; q++) @@ -67,7 +70,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v { Mat outm = xq.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = outm.row(i); @@ -99,27 +102,43 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v { Mat outm = xk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < dst_seqlen; i++) { float* outptr = outm.row(i); for (int j = 0; j < embed_dim_per_head; j++) { const float* ptr = k_blob.row(i); - const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); + const float* kptr = (const float*)k_weight_data + kdim * (q * embed_dim_per_head + j); - float32x4_t _sum = vdupq_n_f32(k_bias_data[q * embed_dim_per_head + j]); - for (int k = 0; k < embed_dim; k++) + if (dst_elempack == 4) { - float32x4_t _val = vld1q_f32(ptr); - float32x4_t _k = vdupq_n_f32(kptr[0]); - _sum = vmlaq_f32(_sum, _val, _k); - ptr += 4; - kptr += 1; + float32x4_t _sum = vdupq_n_f32(k_bias_data[q * embed_dim_per_head + j]); + for (int k = 0; k < kdim; k++) + { + float32x4_t _val = vld1q_f32(ptr); + float32x4_t _k = vdupq_n_f32(kptr[0]); + _sum = vmlaq_f32(_sum, _val, _k); + ptr += 4; + kptr += 1; + } + + vst1q_f32(outptr, _sum); + outptr += 4; + } + if (dst_elempack == 1) + { + float sum = k_bias_data[q * embed_dim_per_head + j]; + for (int k = 0; k < kdim; k++) + { + sum += ptr[0] * kptr[0]; + ptr += 1; + kptr += 1; + } + + outptr[0] = sum; + outptr += 1; } - - vst1q_f32(outptr, _sum); - outptr += 4; } } } @@ -132,30 +151,46 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v { float* outptr = 
outm.row(i); - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { const float* ptr = v_blob.row(j); - const float* kptr = (const float*)v_weight_data + embed_dim * (q * embed_dim_per_head + i); + const float* kptr = (const float*)v_weight_data + vdim * (q * embed_dim_per_head + i); - float32x4_t _sum = vdupq_n_f32(v_bias_data[q * embed_dim_per_head + i]); - for (int k = 0; k < embed_dim; k++) + if (dst_elempack == 4) { - float32x4_t _val = vld1q_f32(ptr); - float32x4_t _k = vdupq_n_f32(kptr[0]); - _sum = vmlaq_f32(_sum, _val, _k); - ptr += 4; - kptr += 1; + float32x4_t _sum = vdupq_n_f32(v_bias_data[q * embed_dim_per_head + i]); + for (int k = 0; k < vdim; k++) + { + float32x4_t _val = vld1q_f32(ptr); + float32x4_t _k = vdupq_n_f32(kptr[0]); + _sum = vmlaq_f32(_sum, _val, _k); + ptr += 4; + kptr += 1; + } + + vst1q_f32(outptr, _sum); + outptr += 4; + } + if (dst_elempack == 1) + { + float sum = v_bias_data[q * embed_dim_per_head + i]; + for (int k = 0; k < vdim; k++) + { + sum += ptr[0] * kptr[0]; + ptr += 1; + kptr += 1; + } + + outptr[0] = sum; + outptr += 1; } - - vst1q_f32(outptr, _sum); - outptr += 4; } } } // xqk = xq * xk - // xq (embed_dim_per_head, seqlen) - // xk (embed_dim_per_head, seqlen) + // xq (embed_dim_per_head, src_seqlen) + // xk (embed_dim_per_head, dst_seqlen) { const Mat xqm = xq.channel(q); const Mat xkm = xk.channel(q); @@ -165,11 +200,11 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v Mat upxkm; convert_packing(xkm, upxkm, 1); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = outm.row(i); - for (int j = 0; j < seqlen * elempack; j++) + for (int j = 0; j < dst_seqlen * dst_elempack; j++) { const float* qptr = xqm.row(i); const float* kptr = upxkm.row(j); @@ -193,19 +228,19 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v // softmax(xqk) { Mat outm = xqk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* ptr = outm.row(i); float32x4_t _max = vdupq_n_f32(-FLT_MAX); - for (int j = 0; j < seqlen * elempack; j++) + for (int j = 0; j < dst_seqlen * dst_elempack; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); _max = vmaxq_f32(_max, _p); } float32x4_t _sum = vdupq_n_f32(0.f); - for (int j = 0; j < seqlen * elempack; j++) + for (int j = 0; j < dst_seqlen * dst_elempack; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); _p = exp_ps(vsubq_f32(_p, _max)); @@ -213,7 +248,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v _sum = vaddq_f32(_sum, _p); } - for (int j = 0; j < seqlen * elempack; j++) + for (int j = 0; j < dst_seqlen * dst_elempack; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); #if __aarch64__ @@ -227,14 +262,14 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v } // xqkv = xqk * xv - // xqk (seqlen, seqlen) - // xv (seqlen, embed_dim_per_head) - // out (embed_dim_per_head, num_head, seqlen) + // xqk (dst_seqlen, src_seqlen) + // xv (dst_seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, src_seqlen) { const Mat xqkm = xqk.channel(q); const Mat xvm = xv.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = xqkv.channel(i).row(q); @@ -244,7 +279,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v const float* vptr = xvm.row(j); float32x4_t _sum = vdupq_n_f32(0.f); - for (int k = 0; k < seqlen * elempack; k++) + for (int k = 0; k < dst_seqlen * 
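// A hypothetical plain-C++ reference for the cross-attention shape handling introduced
// above: queries come from a sequence of length src_seqlen, keys and values from one of
// length dst_seqlen, so the score matrix is src_seqlen x dst_seqlen and softmax runs
// over the dst axis. Single head, no packing; names are illustrative, not the ncnn code.
#include <float.h>
#include <math.h>
#include <vector>

static void attention_one_head_ref(const float* xq, const float* xk, const float* xv,
                                   int src_seqlen, int dst_seqlen, int embed_dim_per_head,
                                   float* out) // out: src_seqlen x embed_dim_per_head
{
    const float scale = 1.f / sqrtf((float)embed_dim_per_head);

    std::vector<float> score(dst_seqlen);
    for (int i = 0; i < src_seqlen; i++)
    {
        // scaled dot-product scores against every key position
        float max = -FLT_MAX;
        for (int j = 0; j < dst_seqlen; j++)
        {
            float s = 0.f;
            for (int k = 0; k < embed_dim_per_head; k++)
                s += xq[i * embed_dim_per_head + k] * xk[j * embed_dim_per_head + k];
            score[j] = s * scale;
            if (score[j] > max) max = score[j];
        }

        // softmax over the key/value sequence
        float sum = 0.f;
        for (int j = 0; j < dst_seqlen; j++)
        {
            score[j] = expf(score[j] - max);
            sum += score[j];
        }

        // weighted sum of values
        for (int k = 0; k < embed_dim_per_head; k++)
        {
            float v = 0.f;
            for (int j = 0; j < dst_seqlen; j++)
                v += (score[j] / sum) * xv[j * embed_dim_per_head + k];
            out[i * embed_dim_per_head + k] = v;
        }
    }
}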
dst_elempack; k++) { float32x4_t _qk = vld1q_f32(qkptr); float32x4_t _v = vdupq_n_f32(vptr[0]); @@ -261,9 +296,9 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v } // out = affine(xqkv) - // xqkv (embed_dim, seqlen) + // xqkv (embed_dim, src_seqlen) #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = top_blob.row(i); @@ -292,7 +327,14 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v #endif // __ARM_NEON // fallback to native implement - return MultiHeadAttention::forward(bottom_blobs, top_blobs, opt); + std::vector bottom_blobs_unpacked = bottom_blobs; + if (dst_elempack == 4) + { + convert_packing(bottom_blobs[1], bottom_blobs_unpacked[1], 1, opt); + if (bottom_blobs.size() == 3) + convert_packing(bottom_blobs[2], bottom_blobs_unpacked[2], 1, opt); + } + return MultiHeadAttention::forward(bottom_blobs_unpacked, top_blobs, opt); } } // namespace ncnn diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h index f1759f5188ff..074681809bfc 100644 --- a/src/layer/arm/neon_mathfun_fp16s.h +++ b/src/layer/arm/neon_mathfun_fp16s.h @@ -89,9 +89,9 @@ static inline float16x4_t log_ps(float16x4_t x) * } else { x = x - 1.0; } */ uint16x4_t mask = vclt_f16(x, vdup_n_f16(c_cephes_SQRTHF)); - float16x4_t tmp = vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(x), mask)); + float16x4_t tmp = (float16x4_t)(vand_u16((uint16x4_t)(x), mask)); x = vsub_f16(x, one); - e = vsub_f16(e, vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(one), mask))); + e = vsub_f16(e, (float16x4_t)(vand_u16((uint16x4_t)(one), mask))); x = vadd_f16(x, tmp); float16x4_t z = vmul_f16(x, x); @@ -115,7 +115,7 @@ static inline float16x4_t log_ps(float16x4_t x) x = vadd_f16(x, y); x = vfma_f16(x, e, vdup_n_f16(c_cephes_log_q2)); - x = vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(x), invalid_mask)); // negative arg will be NAN + x = (float16x4_t)(vorr_u16((uint16x4_t)(x), invalid_mask)); // negative arg will be NAN return x; } @@ -208,9 +208,9 @@ static inline float16x4_t exp_ps(float16x4_t x) /* if greater, substract 1 */ uint16x4_t mask = vcgt_f16(tmp, fx); - mask = vand_u16(mask, vreinterpret_u16_f16(one)); + mask = vand_u16(mask, (uint16x4_t)(one)); - fx = vsub_f16(tmp, vreinterpret_f16_u16(mask)); + fx = vsub_f16(tmp, (float16x4_t)(mask)); tmp = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C1)); float16x4_t z = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C2)); @@ -489,7 +489,7 @@ static inline float16x4_t tanh_ps(float16x4_t x) // clamp the inputs to the range [-9, 9] since anything outside // this range is -/+1.0f in single-precision. - x2 = vreinterpret_f16_u16(vbsl_u16(vcge_f16(vdup_n_f16(c_tanh_hi), x2), vreinterpret_u16_f16(x2), vreinterpret_u16_f16(vdup_n_f16(c_tanh_hi)))); + x2 = (float16x4_t)(vbsl_u16(vcge_f16(vdup_n_f16(c_tanh_hi), x2), (uint16x4_t)(x2), (uint16x4_t)(vdup_n_f16(c_tanh_hi)))); // since the polynomials are odd/even, we need x**2. float16x4_t z = vmul_f16(x2, x2); @@ -514,10 +514,10 @@ static inline float16x4_t tanh_ps(float16x4_t x) y = vdiv_f16(y, w); // reinstate the sign. - y = vreinterpret_f16_u16(vbsl_u16(vdup_n_u16(1u << 15), vreinterpret_u16_f16(x), vreinterpret_u16_f16(y))); + y = (float16x4_t)(vbsl_u16(vdup_n_u16(1u << 15), (uint16x4_t)(x), (uint16x4_t)(y))); // when the argument is very small in magnitude it's more accurate to just return it. 
- y = vreinterpret_f16_u16(vbsl_u16(tiny_mask, vreinterpret_u16_f16(y), vreinterpret_u16_f16(x))); + y = (float16x4_t)(vbsl_u16(tiny_mask, (uint16x4_t)(y), (uint16x4_t)(x))); return y; } diff --git a/src/layer/arm/pooling_3x3.h b/src/layer/arm/pooling_3x3.h index 5c0b281a379d..f105aeea7567 100644 --- a/src/layer/arm/pooling_3x3.h +++ b/src/layer/arm/pooling_3x3.h @@ -69,7 +69,7 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const O "prfm pldl1keep, [%3, #256] \n" "ld2 {v10.4s, v11.4s}, [%3], #32 \n" - "ext v2.16b, v2.16b, v8.16b, #4 \n" + "ext v2.16b, v2.16b, v8.16b, #4 \n" "fmax v12.4s, v12.4s, v0.4s \n" "ext v4.16b, v4.16b, v10.16b, #4 \n" diff --git a/src/layer/arm/softmax_arm.cpp b/src/layer/arm/softmax_arm.cpp index c00e3d4414e5..77a0e696479f 100644 --- a/src/layer/arm/softmax_arm.cpp +++ b/src/layer/arm/softmax_arm.cpp @@ -76,15 +76,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _sum = vaddq_f32(_sum, vrev64q_f32(_sum)); _sum = vaddq_f32(_sum, vextq_f32(_sum, _sum, 2)); #endif - + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (int i = 0; i < w; i++) { float32x4_t _p = vld1q_f32(ptr + i * 4); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + i * 4, _p); } } @@ -152,11 +149,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vdupq_n_f32(sum[j]); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr, _p); ptr += 4; } @@ -189,14 +182,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _sum = vaddq_f32(_sum, _p); } + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (int j = 0; j < w; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + j * 4, _p); } } @@ -269,11 +260,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vdupq_n_f32(sum[i]); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr, _p); ptr += 4; } @@ -356,11 +343,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vld1q_f32(sumptr); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr, _p); ptr += 4; sumptr += 4; @@ -398,14 +381,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _sum = vaddq_f32(_sum, _p); } + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (int j = 0; j < w; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + j * 4, _p); } @@ -480,14 +461,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int i = 0; #if __ARM_NEON float32x4_t _sum = vdupq_n_f32(sum); + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = 
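// A standalone sketch of the reciprocal trick used in the softmax hunks above:
// vrecpeq_f32 produces a rough estimate of 1/x and one vrecpsq_f32 Newton-Raphson step
// refines it, so the per-element division by the sum becomes a multiply. Slightly less
// accurate than vdivq_f32, but it also works on 32-bit NEON where vdivq_f32 is
// unavailable.
#include <arm_neon.h>

static inline float32x4_t reciprocal_approx(float32x4_t x)
{
    float32x4_t r = vrecpeq_f32(x);      // rough initial estimate of 1/x
    r = vmulq_f32(vrecpsq_f32(x, r), r); // one Newton-Raphson refinement
    return r;
}

// usage inside a normalization loop:
//   float32x4_t _inv_sum = reciprocal_approx(_sum);
//   _p = vmulq_f32(_p, _inv_sum);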
vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (; i + 3 < w; i += 4) { float32x4_t _p = vld1q_f32(ptr + i); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + i, _p); } #endif // __ARM_NEON @@ -587,11 +566,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vld1q_f32(psum); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr, _p); ptr += 4; @@ -674,14 +649,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int j = 0; #if __ARM_NEON float32x4_t _sum = vdupq_n_f32(sum); + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (; j + 3 < w; j += 4) { float32x4_t _p = vld1q_f32(ptr + j); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + j, _p); } #endif // __ARM_NEON @@ -790,11 +763,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vld1q_f32(sumptr); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif // __aarch64__ vst1q_f32(ptr, _p); ptr += 4; @@ -902,11 +871,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr + j); float32x4_t _sum = vld1q_f32(sumptr + j); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr + j, _p); } #endif // __ARM_NEON @@ -989,14 +954,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int j = 0; #if __ARM_NEON float32x4_t _sum = vdupq_n_f32(sum); + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (; j + 3 < w; j += 4) { float32x4_t _p = vld1q_f32(ptr + j); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + j, _p); } #endif // __ARM_NEON diff --git a/src/layer/expanddims.cpp b/src/layer/expanddims.cpp index 4b253d7e1c42..473a3b71b373 100644 --- a/src/layer/expanddims.cpp +++ b/src/layer/expanddims.cpp @@ -26,6 +26,7 @@ int ExpandDims::load_param(const ParamDict& pd) { expand_w = pd.get(0, 0); expand_h = pd.get(1, 0); + expand_d = pd.get(11, 0); expand_c = pd.get(2, 0); axes = pd.get(3, Mat()); @@ -36,16 +37,19 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt { int w = bottom_blob.w; int h = bottom_blob.h; + int channels = bottom_blob.c; int dims = bottom_blob.dims; bool _expand_w = false; bool _expand_h = false; + bool _expand_d = false; bool _expand_c = false; if (axes.empty()) { _expand_w = expand_w; _expand_h = expand_h; + _expand_d = expand_d; _expand_c = expand_c; } else @@ -77,6 +81,22 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt { _expand_w = true; } + if (dims == 3 && axis == 0) + { + _expand_c = true; + } + if (dims == 3 && axis == 1) + { + _expand_d = true; + } + if (dims == 3 && axis == 2) + { + _expand_h = true; + } + if (dims == 3 && axis == 3) + { + _expand_w = true; + } } } @@ -114,6 +134,26 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt } } + if (dims == 3) + { + if 
(_expand_w) + { + top_blob = bottom_blob.reshape(1, w, h, channels, opt.blob_allocator); + } + else if (_expand_h) + { + top_blob = bottom_blob.reshape(w, 1, h, channels, opt.blob_allocator); + } + else if (_expand_d) + { + top_blob = bottom_blob.reshape(w, h, 1, channels, opt.blob_allocator); + } + else if (_expand_c) + { + top_blob = bottom_blob.reshape(w, h, channels, 1, opt.blob_allocator); + } + } + if (top_blob.empty()) return -100; diff --git a/src/layer/expanddims.h b/src/layer/expanddims.h index 71933149eaf3..4c8c990f7561 100644 --- a/src/layer/expanddims.h +++ b/src/layer/expanddims.h @@ -31,6 +31,7 @@ class ExpandDims : public Layer public: int expand_w; int expand_h; + int expand_d; int expand_c; Mat axes; }; diff --git a/src/layer/fold.cpp b/src/layer/fold.cpp new file mode 100644 index 000000000000..c14f01fbb722 --- /dev/null +++ b/src/layer/fold.cpp @@ -0,0 +1,124 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fold.h" + +namespace ncnn { + +Fold::Fold() +{ + one_blob_only = true; +} + +int Fold::load_param(const ParamDict& pd) +{ + kernel_w = pd.get(1, 0); + kernel_h = pd.get(11, kernel_w); + dilation_w = pd.get(2, 1); + dilation_h = pd.get(12, dilation_w); + stride_w = pd.get(3, 1); + stride_h = pd.get(13, stride_w); + pad_left = pd.get(4, 0); + pad_right = pd.get(15, pad_left); + pad_top = pd.get(14, pad_left); + pad_bottom = pd.get(16, pad_top); + output_w = pd.get(20, 0); + output_h = pd.get(21, output_w); + + return 0; +} + +int Fold::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int size = bottom_blob.w; + const int max_channels = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int outw = output_w + pad_left + pad_right; + const int outh = output_h + pad_top + pad_bottom; + + const int inw = (outw - kernel_extent_w) / stride_w + 1; + const int inh = (outh - kernel_extent_h) / stride_h + 1; + + // assert inw * inh == size + + const int maxk = kernel_w * kernel_h; + const int channels = max_channels / maxk; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + top_blob_bordered.create(outw, outh, channels, elemsize, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, channels, elemsize, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + // col2im + const int gap = outw * stride_h - inw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* sptr = bottom_blob.row(p * maxk); + Mat outm = top_blob_bordered.channel(p); + + outm.fill(0.f); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; 
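// A hypothetical helper mirroring the Fold shape bookkeeping above. For example, with
// output_w = output_h = 4, a 3x3 kernel, stride 1, dilation 1 and no padding:
// kernel_extent = 3, inw = inh = (4 - 3) / 1 + 1 = 2, so the unfolded input must have
// w = inw * inh = 4 and h = channels * maxk = channels * 9.
static int fold_input_extent(int out_extent, int pad0, int pad1,
                             int kernel, int dilation, int stride)
{
    const int kernel_extent = dilation * (kernel - 1) + 1;
    return (out_extent + pad0 + pad1 - kernel_extent) / stride + 1;
}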
v++) + { + float* ptr = outm.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < inh; i++) + { + for (int j = 0; j < inw; j++) + { + ptr[0] += sptr[0]; + + ptr += stride_w; + sptr += 1; + } + + ptr += gap; + } + } + } + } + + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + Option opt_b = opt; + opt_b.use_packing_layout = false; + copy_cut_border(top_blob_bordered, top_blob, pad_top, pad_bottom, pad_left, pad_right, opt_b); + if (top_blob.empty()) + return -100; + } + else + { + top_blob = top_blob_bordered; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/fold.h b/src/layer/fold.h new file mode 100644 index 000000000000..e6eccad556a4 --- /dev/null +++ b/src/layer/fold.h @@ -0,0 +1,48 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_FOLD_H +#define LAYER_FOLD_H + +#include "layer.h" + +namespace ncnn { + +class Fold : public Layer +{ +public: + Fold(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +public: + int kernel_w; + int kernel_h; + int dilation_w; + int dilation_h; + int stride_w; + int stride_h; + int pad_left; // -233=SAME_UPPER -234=SAME_LOWER + int pad_right; + int pad_top; + int pad_bottom; + int output_w; + int output_h; +}; + +} // namespace ncnn + +#endif // LAYER_FOLD_H diff --git a/src/layer/glu.cpp b/src/layer/glu.cpp index 245341a53957..bf99a4cd1ea3 100644 --- a/src/layer/glu.cpp +++ b/src/layer/glu.cpp @@ -18,187 +18,203 @@ namespace ncnn { -GLU::GLU() { - one_blob_only = true; - support_inplace = false; +GLU::GLU() +{ + one_blob_only = true; + support_inplace = false; } -int GLU::load_param(const ParamDict &pd) { - axis = pd.get(0, 0); +int GLU::load_param(const ParamDict& pd) +{ + axis = pd.get(0, 0); - return 0; + return 0; } -int GLU::forward(const Mat &bottom_blob, Mat &top_blob, - const Option &opt) const { - int dims = bottom_blob.dims; - int positive_axis = axis < 0 ? dims + axis : axis; +int GLU::forward(const Mat& bottom_blob, Mat& top_blob, + const Option& opt) const +{ + int dims = bottom_blob.dims; + int positive_axis = axis < 0 ? 
dims + axis : axis; - if (dims == 1) { // ignore axis - int w = bottom_blob.w; - int out_w = w / 2; - top_blob.create(out_w, sizeof(float), opt.blob_allocator); + if (dims == 1) + { // ignore axis + int w = bottom_blob.w; + int out_w = w / 2; + top_blob.create(out_w, sizeof(float), opt.blob_allocator); - const float *in_ptr = bottom_blob; - float *out_ptr = top_blob; + const float* in_ptr = bottom_blob; + float* out_ptr = top_blob; -#pragma omp parallel for num_threads(opt.num_threads) - for (int x = 0; x < out_w; ++x) { - float sigmoid = - static_cast(1.f / (1.f + expf(-in_ptr[x + out_w]))); + #pragma omp parallel for num_threads(opt.num_threads) + for (int x = 0; x < out_w; ++x) + { + float sigmoid = static_cast(1.f / (1.f + expf(-in_ptr[x + out_w]))); - out_ptr[x] = in_ptr[x] * sigmoid; - } + out_ptr[x] = in_ptr[x] * sigmoid; + } - return 0; - } // if (dims == 1) + return 0; + } // if (dims == 1) - if (dims == 2 && positive_axis == 0) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_w = w; - int out_h = h / 2; - top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator); + if (dims == 2 && positive_axis == 0) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_w = w; + int out_h = h / 2; + top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator); - int offset = out_w * out_h; + int offset = out_w * out_h; #if 0 -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < out_h; ++y) { - const float *in_ptr = bottom_blob.row(y); - float *out_ptr = top_blob.row(y); - - for (int x = 0; x < w; ++x) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[x + offset]))); - - out_ptr[x] = in_ptr[x] * sigmoid; - } - } + // this one is equivalent to the else branch. It is more readable + // but less efficient + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < out_h; ++y) { + const float *in_ptr = bottom_blob.row(y); + float *out_ptr = top_blob.row(y); + + for (int x = 0; x < w; ++x) { + float sigmoid = + static_cast(1.f / (1.f + exp(-in_ptr[x + offset]))); + + out_ptr[x] = in_ptr[x] * sigmoid; + } + } #else - int size = offset; - const float *in_ptr = bottom_blob; - float *out_ptr = top_blob; - -#pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < size; ++i) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); - out_ptr[i] = in_ptr[i] * sigmoid; - } + int size = offset; + const float* in_ptr = bottom_blob; + float* out_ptr = top_blob; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < size; ++i) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); + out_ptr[i] = in_ptr[i] * sigmoid; + } #endif - return 0; - } // if (dims == 2 && positive_axis == 0) - - if (dims == 2 && positive_axis == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_w = w / 2; - int out_h = h; - - top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator); - -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; ++y) { - const float *in_ptr = bottom_blob.row(y); - float *out_ptr = top_blob.row(y); - - for (int x = 0; x < out_w; ++x) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[x + out_w]))); - out_ptr[x] = in_ptr[x] * sigmoid; - } - } - - return 0; - } // if (dims == 2 && positive_axis == 1) - - if (dims == 3 && positive_axis == 0) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int c = bottom_blob.c; - - int out_w = w; - int out_h = h; - int out_c = c / 2; - - top_blob.create(out_w, 
out_h, out_c, sizeof(float), opt.blob_allocator); - - int offset = out_c * bottom_blob.cstep; - int size = w * h; + return 0; + } // if (dims == 2 && positive_axis == 0) + + if (dims == 2 && positive_axis == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_w = w / 2; + int out_h = h; + + top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; ++y) + { + const float* in_ptr = bottom_blob.row(y); + float* out_ptr = top_blob.row(y); + + for (int x = 0; x < out_w; ++x) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[x + out_w]))); + out_ptr[x] = in_ptr[x] * sigmoid; + } + } -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < out_c; ++q) { - const float *in_ptr = bottom_blob.channel(q); - float *out_ptr = top_blob.channel(q); + return 0; + } // if (dims == 2 && positive_axis == 1) - for (int i = 0; i < size; ++i) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); - out_ptr[i] = in_ptr[i] * sigmoid; - } - } - return 0; - } // if (dims == 3 && positive_axis == 0) { + if (dims == 3 && positive_axis == 0) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int c = bottom_blob.c; - if (dims == 3 && positive_axis == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int c = bottom_blob.c; + int out_w = w; + int out_h = h; + int out_c = c / 2; - int out_w = w; - int out_h = h / 2; - int out_c = c; + top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); - top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); + int offset = out_c * bottom_blob.cstep; + int size = w * h; - int offset = out_h * out_w; - int size = offset; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < out_c; ++q) + { + const float* in_ptr = bottom_blob.channel(q); + float* out_ptr = top_blob.channel(q); -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; ++q) { - const float *in_ptr = bottom_blob.channel(q); - float *out_ptr = top_blob.channel(q); - - for (int i = 0; i < size; ++i) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); - out_ptr[i] = in_ptr[i] * sigmoid; - } - } - return 0; - } // if (dims == 3 && positive_axis == 1) - - if (dims == 3 && positive_axis == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int c = bottom_blob.c; - - int out_w = w / 2; - int out_h = h; - int out_c = c; - - top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); - -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; ++q) { - const float *in_ptr = bottom_blob.channel(q); - float *out_ptr = top_blob.channel(q); - for (int y = 0; y < h; ++y) { - for (int x = 0; x < out_w; ++x) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[x + out_w]))); - out_ptr[x] = in_ptr[x] * sigmoid; + for (int i = 0; i < size; ++i) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); + out_ptr[i] = in_ptr[i] * sigmoid; + } } - in_ptr += w; - out_ptr += out_w; - } - } - return 0; - } // if (dims == 3 && positive_axis == 2) + return 0; + } // if (dims == 3 && positive_axis == 0) { + + if (dims == 3 && positive_axis == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int c = bottom_blob.c; + + int out_w = w; + int out_h = h / 2; + int out_c = c; + + top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); + + int offset = out_h * out_w; + int size = offset; + + #pragma omp 
parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; ++q) + { + const float* in_ptr = bottom_blob.channel(q); + float* out_ptr = top_blob.channel(q); + + for (int i = 0; i < size; ++i) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); + out_ptr[i] = in_ptr[i] * sigmoid; + } + } + return 0; + } // if (dims == 3 && positive_axis == 1) + + if (dims == 3 && positive_axis == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int c = bottom_blob.c; + + int out_w = w / 2; + int out_h = h; + int out_c = c; + + top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; ++q) + { + const float* in_ptr = bottom_blob.channel(q); + float* out_ptr = top_blob.channel(q); + for (int y = 0; y < h; ++y) + { + for (int x = 0; x < out_w; ++x) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[x + out_w]))); + out_ptr[x] = in_ptr[x] * sigmoid; + } + in_ptr += w; + out_ptr += out_w; + } + } + return 0; + } // if (dims == 3 && positive_axis == 2) - return -100; + return -100; } -} // namespace ncnn +} // namespace ncnn diff --git a/src/layer/glu.h b/src/layer/glu.h index 762730635e19..003682955921 100644 --- a/src/layer/glu.h +++ b/src/layer/glu.h @@ -19,19 +19,20 @@ namespace ncnn { -class GLU : public Layer { - public: - GLU(); +class GLU : public Layer +{ +public: + GLU(); - virtual int load_param(const ParamDict &pd); + virtual int load_param(const ParamDict& pd); - virtual int forward(const Mat &bottom_blob, Mat &top_blob, - const Option &opt) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, + const Option& opt) const; - public: - int axis; +public: + int axis; }; -} // namespace ncnn +} // namespace ncnn -#endif // LAYER_GLU_H +#endif // LAYER_GLU_H diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp new file mode 100644 index 000000000000..c9e3969100b2 --- /dev/null +++ b/src/layer/gridsample.cpp @@ -0,0 +1,451 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// coord compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to coord writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "gridsample.h" + +#include + +namespace ncnn { + +GridSample::GridSample() +{ + one_blob_only = false; + support_inplace = false; +} + +int GridSample::load_param(const ParamDict& pd) +{ + sample_type = pd.get(0, 1); + padding_mode = pd.get(1, 1); + align_corner = pd.get(2, 0); + + if (sample_type < 1 || sample_type > 3) + { + NCNN_LOGE("unsupported sample type %d", sample_type); + return -1; + } + + if (padding_mode < 1 || padding_mode > 3) + { + NCNN_LOGE("unsupported padding mode %d", padding_mode); + return -1; + } + + return 0; +} + +// Restore normalized location to acutal image location +// When align_corners is true: +// Normalized location (-1, -1) points to the top-left pixel. +// Normalized location (1, 1) points to the bottom-tight pixel. 
+// When align_corners is false [default]: +// Normalized location (-1, -1) points to the top-left pixel minus half +// pixel coord both directions, i.e, (-0.5, -0.5) coord acutal image space. +// Normalized location (1, 1) points to the bottom-tight pixel plus half +// pixel coord both directions, i.e. (H - 0.5, W - 0.5) coord acutal image space. +static float grid_sample_unormalize(int w, float coordx, int align_corner) +{ + return align_corner ? (coordx + 1) / 2.f * (w - 1) : ((coordx + 1) * w - 1) / 2.f; +} + +static float border_coord(int x, int border) +{ + return std::min(border, std::max(x, 0)); +} + +static float reflect_coord(float x, int high) +{ + x = abs(x); + x = high - abs(x - high); + return x; +} + +static int compute_coord(int sx, int w, int padding_mode, int align_corner) +{ + if (padding_mode == 2) // border + { + sx = border_coord(sx, w - 1); + } + else if (padding_mode == 3) // reflection + { + if (align_corner) + { + sx = reflect_coord(sx, w - 1); + } + else + { + sx = static_cast(reflect_coord(sx + 0.5, w) - 0.5); + sx = border_coord(sx, w - 1); + } + } + + return sx; +} + +static bool in_bounds(const Mat& image, int x, int y) +{ + return x >= 0 && y >= 0 && x < image.w && y < image.h; +} + +static bool in_bounds(const Mat& image, int x, int y, int z) +{ + return x >= 0 && y >= 0 && z >= 0 && x < image.w && y < image.h && z < image.c; +} + +static float get_value_bounded(const Mat& image, int x, int y) +{ + return in_bounds(image, x, y) ? image.row(y)[x] : 0.f; +} + +static float get_value_bounded(const Mat& image, int x, int y, int z) +{ + return in_bounds(image, x, y, z) ? image.channel(z).row(y)[x] : 0.f; +} + +static float get_value_bounded(const Mat& image, int x, int y, int padding_mode, int align_corner) +{ + x = compute_coord(x, image.w, padding_mode, align_corner); + y = compute_coord(y, image.h, padding_mode, align_corner); + + return get_value_bounded(image, x, y); +} + +static float get_value_bounded(const Mat& image, int x, int y, int z, int padding_mode, int align_corner) +{ + x = compute_coord(x, image.w, padding_mode, align_corner); + y = compute_coord(y, image.h, padding_mode, align_corner); + z = compute_coord(z, image.c, padding_mode, align_corner); + + return get_value_bounded(image, x, y, z); +} + +static inline void interpolate_cubic(float fx, float* coeffs) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + + if (dims == 3) + { + int outw = grid.h; + int outh = grid.c; + + top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (sample_type == 1) // bilinear + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int y 
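// A worked example of the unnormalize mapping defined above, for a row of w = 5 pixels
// (reference only, same formula as grid_sample_unormalize):
//   align_corner = 1:  x = -1 -> 0.0,  x = 1 -> 4.0   (corners land on pixel centers)
//   align_corner = 0:  x = -1 -> -0.5, x = 1 -> 4.5   (corners land on pixel edges)
static float unormalize_ref(int w, float x, int align_corner)
{
    return align_corner ? (x + 1) / 2.f * (w - 1) : ((x + 1) * w - 1) / 2.f;
}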
= 0; y < outh; y++) + { + const float* gridptr = grid.channel(y); + + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + // bilinear interpolate + float v; + { + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + float v00 = get_value_bounded(image, x0, y0, padding_mode, align_corner); + float v01 = get_value_bounded(image, x1, y0, padding_mode, align_corner); + float v10 = get_value_bounded(image, x0, y1, padding_mode, align_corner); + float v11 = get_value_bounded(image, x1, y1, padding_mode, align_corner); + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + v = v0 * (1 - beta) + v1 * beta; + } + + outptr[0] = v; + outptr += 1; + + gridptr += 2; + } + } + } + } + else if (sample_type == 2) // nearest + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + const float* gridptr = grid.channel(y); + + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + int x0 = static_cast(round(sample_x)); + int y0 = static_cast(round(sample_y)); + + float v = get_value_bounded(image, x0, y0, padding_mode, align_corner); + + outptr[0] = v; + outptr += 1; + + gridptr += 2; + } + } + } + } + else if (sample_type == 3) // bicubic + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + const float* gridptr = grid.channel(y); + + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + // bicubic interpolate + float v; + { + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + float v00 = get_value_bounded(image, x0, y0, padding_mode, align_corner); + float v01 = get_value_bounded(image, x1, y0, padding_mode, align_corner); + float v02 = get_value_bounded(image, x2, y0, padding_mode, align_corner); + float v03 = get_value_bounded(image, x3, y0, padding_mode, align_corner); + float v10 = get_value_bounded(image, x0, y1, padding_mode, align_corner); + float v11 = get_value_bounded(image, x1, y1, padding_mode, align_corner); + float v12 = get_value_bounded(image, x2, y1, padding_mode, align_corner); + float v13 = get_value_bounded(image, x3, y1, padding_mode, align_corner); + float v20 = get_value_bounded(image, x0, y2, padding_mode, align_corner); + float v21 = get_value_bounded(image, x1, y2, padding_mode, align_corner); + float v22 = get_value_bounded(image, x2, y2, padding_mode, align_corner); + float v23 = get_value_bounded(image, x3, y2, padding_mode, align_corner); + float v30 = get_value_bounded(image, x0, y3, padding_mode, align_corner); + float v31 = 
get_value_bounded(image, x1, y3, padding_mode, align_corner); + float v32 = get_value_bounded(image, x2, y3, padding_mode, align_corner); + float v33 = get_value_bounded(image, x3, y3, padding_mode, align_corner); + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x1, x_coeffs); + interpolate_cubic(sample_y - y1, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + v = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + + outptr[0] = v; + outptr += 1; + + gridptr += 2; + } + } + } + } + } + + if (dims == 4) + { + int outw = grid.h; + int outh = grid.d; + int outd = grid.c; + + top_blob.create(outw, outh, outd, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (sample_type == 1) // bilinear + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int z = 0; z < outd; z++) + { + const float* gridptr = grid.channel(z); + + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + float sample_z = gridptr[2]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + + // bilinear interpolate + float v; + { + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int z0 = (int)floor(sample_z); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + float v000 = get_value_bounded(image, x0, y0, z0, padding_mode, align_corner); + float v001 = get_value_bounded(image, x1, y0, z0, padding_mode, align_corner); + float v010 = get_value_bounded(image, x0, y1, z0, padding_mode, align_corner); + float v011 = get_value_bounded(image, x1, y1, z0, padding_mode, align_corner); + float v100 = get_value_bounded(image, x0, y0, z1, padding_mode, align_corner); + float v101 = get_value_bounded(image, x1, y0, z1, padding_mode, align_corner); + float v110 = get_value_bounded(image, x0, y1, z1, padding_mode, align_corner); + float v111 = get_value_bounded(image, x1, y1, z1, padding_mode, align_corner); + + float alpha = sample_x - x0; + float beta = sample_y - y0; + float gamma = sample_z - z0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + v = v0 * (1 - gamma) + v1 * gamma; + } + + outptr[0] = v; + outptr += 1; + + gridptr += 3; + } + } + } + } + } + else if (sample_type == 2) // nearest + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int z = 0; z < outd; z++) + { + const float* gridptr = grid.channel(z); + + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + float sample_z 
= gridptr[2]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + + int x0 = static_cast(round(sample_x)); + int y0 = static_cast(round(sample_y)); + int z0 = static_cast(round(sample_z)); + + float v = get_value_bounded(image, x0, y0, z0, padding_mode, align_corner); + + outptr[0] = v; + outptr += 1; + + gridptr += 3; + } + } + } + } + } + else if (sample_type == 3) + { + NCNN_LOGE("unsupported bicubic when dims == 4"); + return -1; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h new file mode 100644 index 000000000000..0ea540eb4baf --- /dev/null +++ b/src/layer/gridsample.h @@ -0,0 +1,40 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_GRIDSAMPLE_H +#define LAYER_GRIDSAMPLE_H + +#include "layer.h" + +namespace ncnn { + +class GridSample : public Layer +{ +public: + GridSample(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + // param + int sample_type; // 1=bilinear 2=nearest 3=bicubic + int padding_mode; // 1=zeros 2=border 3=reflection + int align_corner; +}; + +} // namespace ncnn + +#endif // LAYER_GRIDSAMPLE_H diff --git a/src/layer/groupnorm.cpp b/src/layer/groupnorm.cpp index 81847d573195..596d3974308d 100644 --- a/src/layer/groupnorm.cpp +++ b/src/layer/groupnorm.cpp @@ -52,66 +52,180 @@ int GroupNorm::load_model(const ModelBin& mb) int GroupNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { - // x = (x - mean) / sqrt(var + eps) * gamma + beta + const int dims = bottom_top_blob.dims; + const int channels_per_group = channels / group; - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int size = w * h; - - int channels_per_group = channels / group; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < group; g++) + if (dims == 1) { - Mat bottom_top_blob_g = bottom_top_blob.channel_range(g * channels_per_group, channels_per_group); - - // mean and var - float sum = 0.f; - for (int q = 0; q < channels_per_group; q++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) { - const float* ptr = bottom_top_blob_g.channel(q); - for (int i = 0; i < size; i++) + Mat bottom_top_blob_g = bottom_top_blob.range(g * channels_per_group, channels_per_group); + const Mat gamma_data_g = gamma_data.range(g * channels_per_group, channels_per_group); + const Mat beta_data_g = beta_data.range(g * channels_per_group, channels_per_group); + + // mean and var + float sum = 0.f; + for (int q = 0; q < channels_per_group; q++) { - sum += ptr[i]; + sum += bottom_top_blob_g[q]; } - } - float mean = sum / 
(channels_per_group * size); + float mean = sum / channels_per_group; - float sqsum = 0.f; - for (int q = 0; q < channels_per_group; q++) - { - const float* ptr = bottom_top_blob_g.channel(q); - for (int i = 0; i < size; i++) + float sqsum = 0.f; + for (int q = 0; q < channels_per_group; q++) { - float tmp = ptr[i] - mean; + float tmp = bottom_top_blob_g[q] - mean; sqsum += tmp * tmp; } + float var = sqsum / channels_per_group; + + for (int q = 0; q < channels_per_group; q++) + { + float a; + float b; + if (affine) + { + float gamma = gamma_data_g[q]; + float beta = beta_data_g[q]; + + a = (float)(gamma / sqrt(var + eps)); + b = -mean * a + beta; + } + else + { + a = (float)(1.f / (sqrt(var + eps))); + b = -mean * a; + } + + bottom_top_blob_g[q] = bottom_top_blob_g[q] * a + b; + } } - float var = sqsum / (channels_per_group * size); + } - for (int q = 0; q < channels_per_group; q++) + if (dims == 2) + { + int w = bottom_top_blob.w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) { - float a; - float b; - if (affine) + Mat bottom_top_blob_g = bottom_top_blob.row_range(g * channels_per_group, channels_per_group); + const Mat gamma_data_g = gamma_data.range(g * channels_per_group, channels_per_group); + const Mat beta_data_g = beta_data.range(g * channels_per_group, channels_per_group); + + // mean and var + float sum = 0.f; + for (int q = 0; q < channels_per_group; q++) { - float gamma = gamma_data[g * channels_per_group + q]; - float beta = beta_data[g * channels_per_group + q]; + const float* ptr = bottom_top_blob_g.row(q); + for (int i = 0; i < w; i++) + { + sum += ptr[i]; + } + } + float mean = sum / (channels_per_group * w); - a = static_cast(gamma / sqrt(var + eps)); - b = -mean * a + beta; + float sqsum = 0.f; + for (int q = 0; q < channels_per_group; q++) + { + const float* ptr = bottom_top_blob_g.row(q); + for (int i = 0; i < w; i++) + { + float tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } } - else + float var = sqsum / (channels_per_group * w); + + for (int q = 0; q < channels_per_group; q++) { - a = static_cast(1.f / (sqrt(var + eps))); - b = -mean * a; + float a; + float b; + if (affine) + { + float gamma = gamma_data_g[q]; + float beta = beta_data_g[q]; + + a = (float)(gamma / sqrt(var + eps)); + b = -mean * a + beta; + } + else + { + a = (float)(1.f / (sqrt(var + eps))); + b = -mean * a; + } + + float* ptr = bottom_top_blob_g.row(q); + for (int i = 0; i < w; i++) + { + ptr[i] = ptr[i] * a + b; + } } + } + } - float* ptr = bottom_top_blob_g.channel(q); + if (dims == 3 || dims == 4) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat bottom_top_blob_g = bottom_top_blob.channel_range(g * channels_per_group, channels_per_group); + const Mat gamma_data_g = gamma_data.range(g * channels_per_group, channels_per_group); + const Mat beta_data_g = beta_data.range(g * channels_per_group, channels_per_group); + + // mean and var + float sum = 0.f; + for (int q = 0; q < channels_per_group; q++) + { + const float* ptr = bottom_top_blob_g.channel(q); + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + } + } + float mean = sum / (channels_per_group * size); + + float sqsum = 0.f; + for (int q = 0; q < channels_per_group; q++) + { + const float* ptr = bottom_top_blob_g.channel(q); + for (int i = 0; i < size; i++) + { + float tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } + } + float 
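// A scalar sketch of the affine fold repeated in each GroupNorm branch above: rather
// than evaluating (x - mean) / sqrt(var + eps) * gamma + beta per element, a and b are
// derived once per channel so each element needs only a multiply-add. Illustrative
// helper, not the ncnn code.
#include <math.h>

static void group_norm_apply(float* ptr, int size, float mean, float var,
                             float eps, float gamma, float beta)
{
    const float a = gamma / sqrtf(var + eps);
    const float b = -mean * a + beta;

    for (int i = 0; i < size; i++)
        ptr[i] = ptr[i] * a + b;
}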
var = sqsum / (channels_per_group * size); - for (int i = 0; i < size; i++) + for (int q = 0; q < channels_per_group; q++) { - ptr[i] = ptr[i] * a + b; + float a; + float b; + if (affine) + { + float gamma = gamma_data_g[q]; + float beta = beta_data_g[q]; + + a = (float)(gamma / sqrt(var + eps)); + b = -mean * a + beta; + } + else + { + a = (float)(1.f / (sqrt(var + eps))); + b = -mean * a; + } + + float* ptr = bottom_top_blob_g.channel(q); + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } } } } diff --git a/src/layer/loongarch/absval_loongarch.cpp b/src/layer/loongarch/absval_loongarch.cpp new file mode 100644 index 000000000000..ea60b01eaf02 --- /dev/null +++ b/src/layer/loongarch/absval_loongarch.cpp @@ -0,0 +1,67 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "absval_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +AbsVal_loongarch::AbsVal_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + __m128i _outp = __lsx_vbitclri_w(_p, 31); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr > 0 ? *ptr : -*ptr; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/absval_loongarch.h b/src/layer/loongarch/absval_loongarch.h new file mode 100644 index 000000000000..0a3143cea432 --- /dev/null +++ b/src/layer/loongarch/absval_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
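Editor's sketch (illustrative only, not part of this patch): the GroupNorm rewrite above reduces each group to a single multiply-add per element once the group mean and variance are known, by folding gamma/beta into a scale a and shift b. A minimal scalar version of that per-group step, using generic names (data, count, gamma, beta, eps) rather than ncnn's Mat types:

#include <cmath>

// Illustrative scalar version of one group's normalization: mean and variance
// over the group, then gamma/beta folded into a scale a and shift b so the
// final pass is one multiply-add per element.
static void group_norm_1d(float* data, int count, float gamma, float beta, float eps)
{
    float sum = 0.f;
    for (int i = 0; i < count; i++)
        sum += data[i];
    const float mean = sum / count;

    float sqsum = 0.f;
    for (int i = 0; i < count; i++)
    {
        const float t = data[i] - mean;
        sqsum += t * t;
    }
    const float var = sqsum / count;

    const float a = gamma / std::sqrt(var + eps); // scale
    const float b = -mean * a + beta;             // shift
    for (int i = 0; i < count; i++)
        data[i] = data[i] * a + b;
}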
+ +#ifndef LAYER_ABSVAL_LOONGARCH_H +#define LAYER_ABSVAL_LOONGARCH_H + +#include "absval.h" + +namespace ncnn { + +class AbsVal_loongarch : virtual public AbsVal +{ +public: + AbsVal_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ABSVAL_LOONGARCH_H diff --git a/src/layer/loongarch/batchnorm_loongarch.cpp b/src/layer/loongarch/batchnorm_loongarch.cpp new file mode 100644 index 000000000000..f0e33b78efdc --- /dev/null +++ b/src/layer/loongarch/batchnorm_loongarch.cpp @@ -0,0 +1,145 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "batchnorm_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +BatchNorm_loongarch::BatchNorm_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int BatchNorm_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + int w = bottom_top_blob.w * elempack; + +#if __loongarch_sx + int nn_w = w / 4; + int remain_w_start = nn_w * 4; +#else + int remain_w_start = 0; +#endif // __loongarch_sx + + float* ptr = bottom_top_blob; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _a = (__m128)__lsx_vld((const float*)a_data + i * 4, 0); + __m128 _b = (__m128)__lsx_vld((const float*)b_data + i * 4, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + ptr[i] = b_data[i] * ptr[i] + a_data[i]; + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w * elempack; + int h = bottom_top_blob.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + float a = a_data[i]; + float b = b_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a); + __m128 _b = elempack == 4 ? 
(__m128)__lsx_vld((const float*)b_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr = b * *ptr + a; + ptr++; + } + } + } + + if (dims == 3 || dims == 4) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a); + __m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = b * *ptr + a; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/batchnorm_loongarch.h b/src/layer/loongarch/batchnorm_loongarch.h new file mode 100644 index 000000000000..8b38d5e1f666 --- /dev/null +++ b/src/layer/loongarch/batchnorm_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BATCHNORM_LOONGARCH_H +#define LAYER_BATCHNORM_LOONGARCH_H + +#include "batchnorm.h" + +namespace ncnn { + +class BatchNorm_loongarch : virtual public BatchNorm +{ +public: + BatchNorm_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BATCHNORM_LOONGARCH_H diff --git a/src/layer/loongarch/bias_loongarch.cpp b/src/layer/loongarch/bias_loongarch.cpp new file mode 100644 index 000000000000..74129a8d3284 --- /dev/null +++ b/src/layer/loongarch/bias_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
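Editor's sketch (illustrative only, not part of this patch): the LSX BatchNorm kernel above only ever evaluates y = b * x + a per channel; slope/mean/var/bias have already been folded into a_data and b_data before forward_inplace runs, in the base BatchNorm layer. A hedged scalar sketch of that fold with generic names (the exact formula is shown as an assumption mirroring ncnn's base layer):

#include <cmath>
#include <vector>

// Illustrative fold of slope (gamma), mean, var and bias (beta) into the
// per-channel a/b pair the kernel consumes.
struct FoldedBN
{
    std::vector<float> a; // shift
    std::vector<float> b; // scale
};

static FoldedBN fold_batchnorm(const std::vector<float>& slope,
                               const std::vector<float>& mean,
                               const std::vector<float>& var,
                               const std::vector<float>& bias,
                               float eps)
{
    FoldedBN f;
    f.a.resize(mean.size());
    f.b.resize(mean.size());
    for (size_t i = 0; i < mean.size(); i++)
    {
        const float sqrt_var = std::sqrt(var[i] + eps);
        f.b[i] = slope[i] / sqrt_var;                     // scale
        f.a[i] = bias[i] - slope[i] * mean[i] / sqrt_var; // shift
    }
    return f;
}
// forward then reduces to y = b[c] * x + a[c] for every element of channel c.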
+ +#include "bias_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +int Bias_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int size = w * h * d; + + const float* bias_ptr = bias_data; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + float bias = bias_ptr[q]; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = __lsx_vfadd_s(_p, _bias); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr = *ptr + bias; + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/bias_loongarch.h b/src/layer/loongarch/bias_loongarch.h new file mode 100644 index 000000000000..f122ffa0dd92 --- /dev/null +++ b/src/layer/loongarch/bias_loongarch.h @@ -0,0 +1,30 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BIAS_LOONGARCH_H +#define LAYER_BIAS_LOONGARCH_H + +#include "bias.h" + +namespace ncnn { + +class Bias_loongarch : virtual public Bias +{ +public: + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BIAS_LOONGARCH_H diff --git a/src/layer/loongarch/binaryop_loongarch.cpp b/src/layer/loongarch/binaryop_loongarch.cpp new file mode 100644 index 000000000000..7832c9ca732b --- /dev/null +++ b/src/layer/loongarch/binaryop_loongarch.cpp @@ -0,0 +1,1066 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "binaryop_loongarch.h" + +#include + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +BinaryOp_loongarch::BinaryOp_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +template +static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = b.w; + int h = b.h; + int d = b.d; + int channels = b.c; + int elempack = b.elempack; + int size = w * h * d * elempack; + + // type 2 3 4 20 + c.create_like(b, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float a0 = a[0]; + const float* ptr = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _a0 = __lsx_vreplfr2vr_s(a0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(a0, *ptr); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 6 11 16 25 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float b0 = b[0]; + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _b0 = __lsx_vreplfr2vr_s(b0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(*ptr, b0); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 7 13 19 29 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(*ptr, *ptr1); + ptr += 1; + ptr1 += 1; + outptr += 1; + } + } + + return 0; +} + +#if __loongarch_sx +// broadcasting rule +// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting + +template +static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int size = w * h * d; + size_t elemsize = a.elemsize; + int elempack = a.elempack; + + int w1 = b.w; + int h1 = 
b.h; + int d1 = b.d; + int channels1 = b.c; + int size1 = w1 * h1 * d1; + size_t elemsize1 = b.elemsize; + int elempack1 = b.elempack; + + if (a.dims == 4) + { + if (b.dims == 4) + { + // type 29 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 3) + { + // type 28 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 27 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 25 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 26 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = (__m128)__lsx_vld((const float*)b + q * 4, 0); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 3) + { + if (b.dims == 4) + { + // type 23 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + } + + return 0; + } + + if (b.dims == 3) + { + if (w1 == 1 && h1 == 1 && channels1 == channels) + { + // special type 1 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* b0 = b.channel(q); + float* outptr = c.channel(q); + __m128 _b0 = (__m128)__lsx_vld(b0, 0); + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && 
h1 == h && channels1 == 1 && elempack1 == 1) + { + // special type 2 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b; + float* outptr = c.channel(q); + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = __lsx_vreplfr2vr_s(ptr1[0]); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + ptr1 += 1; + outptr += 4; + } + } + + return 0; + } + + if (w == 1 && h == 1 && channels1 == channels) + { + // special type 3 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* a0 = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + __m128 _a0 = (__m128)__lsx_vld(a0, 0); + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels == 1 && elempack == 1) + { + // special type 4 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a; + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _p = __lsx_vreplfr2vr_s(ptr[0]); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 1; + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) + { + // special type 5 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _p1 = (__m128)__lsx_vld(ptr1 + y * 4, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) + { + // special type 6 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1 + x * 4, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) + { + // special type 7 + c.create(w1, h1, channels1, elemsize1, elempack1, 
opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _p = (__m128)__lsx_vld(ptr + y * 4, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) + { + // special type 8 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr + x * 4, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + // type 19 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 18 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 16 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 17 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = (__m128)__lsx_vld((const float*)b + q * 4, 0); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 2) + { + if (b.dims == 4) + { + // type 22 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + ptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 14 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; 
q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + + return 0; + } + + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 13 + return binary_op_7_13_19_29(a, b, c, opt); + } + + if (b.dims == 1) + { + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 11 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 12 + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + + return 0; + } + } + else if (a.dims == 1) + { + if (a.w == 1 && elempack == 1) + { + // type 2 3 4 20 + return binary_op_2_3_4_20(a, b, c, opt); + } + + if (b.dims == 4) + { + // type 21 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = (__m128)__lsx_vld((const float*)a + q * 4, 0); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 9 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = (__m128)__lsx_vld((const float*)a + q * 4, 0); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 8 + c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 6 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 7 + binary_op_7_13_19_29(a, b, c, opt); + } + } + + return 0; +} +#endif // __loongarch_sx + +template +static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt) +{ + Op op; + + int w = a.w; + int h = 
a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = a.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _b = __lsx_vreplfr2vr_s(b); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = op(_p, _b); + __lsx_vst(_p, ptr, 0); + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = op(*ptr, b); + ptr++; + } + } + + return 0; +} + +namespace BinaryOp_loongarch_functor { + +#if __loongarch_sx +#define MAKE_FUNCTION(NAME, IMPL, IMPL4) \ + struct NAME \ + { \ + float operator()(const float& x, const float& y) const \ + { \ + return IMPL; \ + } \ + __m128 operator()(const __m128& x, const __m128& y) const \ + { \ + return IMPL4; \ + } \ + }; +#else +#define MAKE_FUNCTION(NAME, IMPL, IMPL4) \ + struct NAME \ + { \ + float operator()(const float& x, const float& y) const \ + { \ + return IMPL; \ + } \ + }; +#endif // __loongarch_sx + +// clang-format off +// *INDENT-OFF* +MAKE_FUNCTION(binary_op_add, x + y, __lsx_vfadd_s(x, y)) +MAKE_FUNCTION(binary_op_sub, x - y, __lsx_vfsub_s(x, y)) +MAKE_FUNCTION(binary_op_mul, x * y, __lsx_vfmul_s(x, y)) +MAKE_FUNCTION(binary_op_div, x / y, __lsx_vfdiv_s(x, y)) +MAKE_FUNCTION(binary_op_max, std::max(x, y), __lsx_vfmax_s(x, y)) +MAKE_FUNCTION(binary_op_min, std::min(x, y), __lsx_vfmin_s(x, y)) +MAKE_FUNCTION(binary_op_pow, (float)pow(x, y), pow_ps(x, y)) +MAKE_FUNCTION(binary_op_rsub, y - x, __lsx_vfsub_s(y, x)) +MAKE_FUNCTION(binary_op_rdiv, y / x, __lsx_vfdiv_s(y, x)) +// *INDENT-ON* +// clang-format on + +#undef MAKE_FUNCTION + +} // namespace BinaryOp_loongarch_functor + +int BinaryOp_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#if __loongarch_sx + using namespace BinaryOp_loongarch_functor; + + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& bottom_blob1 = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int elempack = bottom_blob.elempack; + int elempack1 = bottom_blob1.elempack; + + if (elempack == 4 || elempack1 == 4) + { + if (op_type == Operation_ADD) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_SUB) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MUL) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_DIV) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MAX) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MIN) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_POW) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_RSUB) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + + if (op_type == Operation_RDIV) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + } +#endif // __loongarch_sx + + return BinaryOp::forward(bottom_blobs, top_blobs, opt); +} + +int BinaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + using namespace BinaryOp_loongarch_functor; + + if (op_type == Operation_ADD) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_SUB) + return binary_op_scalar_inplace(bottom_top_blob, b, 
opt); + + if (op_type == Operation_MUL) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_DIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MAX) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MIN) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_POW) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RSUB) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RDIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/binaryop_loongarch.h b/src/layer/loongarch/binaryop_loongarch.h new file mode 100644 index 000000000000..bcf9ef5442fc --- /dev/null +++ b/src/layer/loongarch/binaryop_loongarch.h @@ -0,0 +1,34 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BINARYOP_LOONGARCH_H +#define LAYER_BINARYOP_LOONGARCH_H + +#include "binaryop.h" + +namespace ncnn { + +class BinaryOp_loongarch : virtual public BinaryOp +{ +public: + BinaryOp_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BINARYOP_LOONGARCH_H diff --git a/src/layer/loongarch/cast_loongarch.cpp b/src/layer/loongarch/cast_loongarch.cpp new file mode 100644 index 000000000000..2e956657f142 --- /dev/null +++ b/src/layer/loongarch/cast_loongarch.cpp @@ -0,0 +1,209 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
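Editor's sketch (illustrative only, not part of this patch): the BinaryOp implementation above dispatches through tiny functor structs generated by MAKE_FUNCTION, so a single templated loop serves add, sub, mul, div, max, min, pow and the reversed variants. A reduced version of that pattern with hypothetical names, not the patch's own:

#include <algorithm>

// Each operation is a small functor with a float operator() (and, under
// __loongarch_sx, an __m128 overload); one templated loop is instantiated
// per operation.
struct op_add
{
    float operator()(float x, float y) const { return x + y; }
};
struct op_max
{
    float operator()(float x, float y) const { return std::max(x, y); }
};

template<typename Op>
static void apply_scalar_inplace(float* ptr, int size, float b)
{
    Op op;
    for (int i = 0; i < size; i++)
        ptr[i] = op(ptr[i], b); // x + b, max(x, b), ...
}

// usage: apply_scalar_inplace<op_add>(data, n, 0.5f);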
+ +#include "cast_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Cast_loongarch::Cast_loongarch() +{ + support_packing = true; +} + +int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (type_from == type_to) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + size_t out_elemsize = elemsize; + if (type_to == 1) + { + if (type_from == 3) + { + Cast::forward(bottom_blob, top_blob, opt); + } + + // float32 + out_elemsize = 4 * elempack; + } + else if (type_to == 2) + { + // float16 + out_elemsize = 2 * elempack; + } + else if (type_to == 3) + { + // int8 + out_elemsize = elempack; + } + else if (type_to == 4) + { + // bfloat16 + out_elemsize = 2 * elempack; + } + + if (dims == 1) + { + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 2) + { + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 3) + { + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 4) + { + top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); + } + if (top_blob.empty()) + return -100; + + int size = w * h * d * elempack; + + if (type_from == 1 && type_to == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + unsigned short* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 16); + __m128 _p0 = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr + 4, 0); + __m128i _p = __lsx_vfcvt_h_s(_p1, _p0); + __lsx_vst(_p, outptr, 0); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = float32_to_float16(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 2 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + __m128 _p0 = __lsx_vfcvtl_s_h(_p); + __m128 _p1 = __lsx_vfcvth_s_h(_p); + __lsx_vst(_p0, outptr, 0); + __lsx_vst(_p1, outptr + 4, 0); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = float16_to_float32(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 3 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = (float)ptr[i]; + } + } + } + + if (type_from == 4 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + *outptr = bfloat16_to_float32(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 1 && type_to == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for 
(int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + unsigned short* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + *outptr = float32_to_bfloat16(*ptr); + outptr++; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/cast_loongarch.h b/src/layer/loongarch/cast_loongarch.h new file mode 100644 index 000000000000..1fe75c687d8e --- /dev/null +++ b/src/layer/loongarch/cast_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CAST_LOONGARCH_H +#define LAYER_CAST_LOONGARCH_H + +#include "cast.h" + +namespace ncnn { + +class Cast_loongarch : virtual public Cast +{ +public: + Cast_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CAST_LOONGARCH_H diff --git a/src/layer/loongarch/clip_loongarch.cpp b/src/layer/loongarch/clip_loongarch.cpp new file mode 100644 index 000000000000..7cf0246d060c --- /dev/null +++ b/src/layer/loongarch/clip_loongarch.cpp @@ -0,0 +1,76 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
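Editor's sketch (illustrative only, not part of this patch): the scalar tails of the Cast kernel above convert through helpers such as float32_to_bfloat16. bfloat16 is the top 16 bits of an IEEE-754 float, so a truncating round-trip looks like the following; ncnn's helper may round to nearest rather than truncate, so treat this purely as an illustration:

#include <cstdint>
#include <cstring>

// Truncating float32 -> bfloat16: keep the high 16 bits.
static uint16_t f32_to_bf16_trunc(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    return (uint16_t)(u >> 16); // drop the low 16 mantissa bits
}

// bfloat16 -> float32: re-expand into the high half of a 32-bit pattern.
static float bf16_to_f32(uint16_t h)
{
    const uint32_t u = (uint32_t)h << 16;
    float x;
    std::memcpy(&x, &u, sizeof(x));
    return x;
}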
+ +#include "clip_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Clip_loongarch::Clip_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Clip_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _max = (__m128)__lsx_vreplfr2vr_s(max); + __m128 _min = (__m128)__lsx_vreplfr2vr_s(min); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _min); + _p = __lsx_vfmin_s(_p, _max); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < min) + *ptr = min; + + if (*ptr > max) + *ptr = max; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/clip_loongarch.h b/src/layer/loongarch/clip_loongarch.h new file mode 100644 index 000000000000..43df62035ff3 --- /dev/null +++ b/src/layer/loongarch/clip_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CLIP_LOONGARCH_H +#define LAYER_CLIP_LOONGARCH_H + +#include "clip.h" + +namespace ncnn { + +class Clip_loongarch : virtual public Clip +{ +public: + Clip_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CLIP_LOONGARCH_H diff --git a/src/layer/loongarch/concat_loongarch.cpp b/src/layer/loongarch/concat_loongarch.cpp new file mode 100644 index 000000000000..50460f8c134b --- /dev/null +++ b/src/layer/loongarch/concat_loongarch.cpp @@ -0,0 +1,348 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "concat_loongarch.h" + +namespace ncnn { + +Concat_loongarch::Concat_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Concat_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + int dims = bottom_blobs[0].dims; + int positive_axis = axis < 0 ? dims + axis : axis; + + if (dims == 1) // positive_axis == 0 + { + // concat vector + // total length + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + float* outptr = top_blob; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob; + memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize); + + outptr += bottom_blob.w * bottom_blob.elempack; + } + } + + if (dims == 2 && positive_axis == 0) + { + // concat image + int w = bottom_blobs[0].w; + + // total height + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_h += bottom_blob.h * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + float* outptr = top_blob_unpacked; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + if (bottom_blob.elempack == 4 && elempack == 1) + { + for (int i = 0; i < bottom_blob.h; i++) + { + const float* r0 = bottom_blob.row(i); + + float* outptr0 = outptr; + float* outptr1 = outptr + w; + float* outptr2 = outptr + w * 2; + float* outptr3 = outptr + w * 3; + + for (int j = 0; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + outptr += w * 4; + } + } + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + { + int size = w * bottom_blob.h; + + const float* ptr = bottom_blob; + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + outptr += size * bottom_blob.elempack; + } + } + + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + } + + if (dims == 2 && positive_axis == 1) + { + // interleave image row + int h = bottom_blobs[0].h; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total width + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* outptr = top_blob.row(i); + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob.row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); + + outptr += bottom_blob.w * elempack; + } + } + } + + if (dims == 3 && positive_axis == 0) + { + // concat dim + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + // total channels + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_channels = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_channels += bottom_blob.c * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int p = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + if (bottom_blob.elempack == 4 && elempack == 1) + { + int size = bottom_blob.w * bottom_blob.h; + + for (int q = 0; q < bottom_blob.c; q++) + { + const float* r0 = bottom_blob.channel(q); + + float* outptr0 = top_blob_unpacked.channel(p); + float* outptr1 = top_blob_unpacked.channel(p + 1); + float* outptr2 = top_blob_unpacked.channel(p + 2); + float* outptr3 = top_blob_unpacked.channel(p + 3); + + for (int i = 0; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + p += 4; + } + } + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + { + int size = bottom_blob.total(); + + const float* ptr = bottom_blob; + float* outptr = top_blob_unpacked.channel(p); + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + p += bottom_blob.c; + } + } + + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + } + + if (dims == 3 && positive_axis == 1) + { + // interleave dim height + int w = bottom_blobs[0].w; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_h += bottom_blob.h; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + int size = bottom_blob.w * bottom_blob.h; + + const float* ptr = bottom_blob.channel(q); + memcpy(outptr, ptr, size * elemsize); + + outptr += size * elempack; + } + } + } + + if (dims == 3 && positive_axis == 2) + { + // interleave dim width + int h = bottom_blobs[0].h; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + + for (int i = 0; i < h; i++) + { + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob.channel(q).row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); + + outptr += bottom_blob.w * elempack; + } + } + } + } + + return 0; +} + +} // 
namespace ncnn diff --git a/src/layer/loongarch/concat_loongarch.h b/src/layer/loongarch/concat_loongarch.h new file mode 100644 index 000000000000..934c85244df3 --- /dev/null +++ b/src/layer/loongarch/concat_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CONCAT_LOONGARCH_H +#define LAYER_CONCAT_LOONGARCH_H + +#include "concat.h" + +namespace ncnn { + +class Concat_loongarch : virtual public Concat +{ +public: + Concat_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CONCAT_LOONGARCH_H diff --git a/src/layer/loongarch/convolution1d_loongarch.cpp b/src/layer/loongarch/convolution1d_loongarch.cpp new file mode 100644 index 000000000000..0b1a11c868f0 --- /dev/null +++ b/src/layer/loongarch/convolution1d_loongarch.cpp @@ -0,0 +1,379 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution1d_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +Convolution1D_loongarch::Convolution1D_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Convolution1D_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + const int num_input = weight_data_size / kernel_w / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + // src = kw-inch-outch + // dst = pb-pa-kw-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output); + + weight_data_packed.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < kernel_w; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } + + return 0; +} + +int Convolution1D_loongarch::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Convolution1D_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = num_output / out_elempack; + + top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val0 = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _val1 = __lsx_vreplfr2vr_s(sptr[1]); + __m128 _val2 = __lsx_vreplfr2vr_s(sptr[2]); + __m128 _val3 = __lsx_vreplfr2vr_s(sptr[3]); + + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + + sptr += dilation_w * 4; + kptr += 16; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + } + } + + if (elempack == 1 && out_elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; 
q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + sptr += dilation_w; + kptr += 4; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + } + } + + if (elempack == 4 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + sptr += dilation_w * 4; + kptr += 4; + } + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data + kernel_w * h * p; + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + float val = sptr[0]; + float wt = kptr[0]; + sum += val * wt; + + sptr += dilation_w; + kptr += 1; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + } + } + } + + return 0; +} + +int Convolution1D_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(2, dilation_w); + pd.set(3, stride_w); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat 
weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/convolution1d_loongarch.h b/src/layer/loongarch/convolution1d_loongarch.h new file mode 100644 index 000000000000..36393df45688 --- /dev/null +++ b/src/layer/loongarch/convolution1d_loongarch.h @@ -0,0 +1,41 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CONVOLUTION1D_LOONGARCH_H +#define LAYER_CONVOLUTION1D_LOONGARCH_H + +#include "convolution1d.h" + +namespace ncnn { + +class Convolution1D_loongarch : virtual public Convolution1D +{ +public: + Convolution1D_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; + +public: + // packn + Mat weight_data_packed; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION1D_LOONGARCH_H diff --git a/src/layer/loongarch/convolution_1x1.h b/src/layer/loongarch/convolution_1x1.h new file mode 100644 index 000000000000..83d3778411ae --- /dev/null +++ b/src/layer/loongarch/convolution_1x1.h @@ -0,0 +1,26 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_int8.h b/src/layer/loongarch/convolution_1x1_int8.h new file mode 100644 index 000000000000..08f439c484ae --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_int8.h @@ -0,0 +1,83 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack1to4_int8.h b/src/layer/loongarch/convolution_1x1_pack1to4_int8.h new file mode 100644 index 000000000000..00e1e2581417 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack1to4_int8.h @@ -0,0 +1,83 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
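// A 1x1 stride-1 convolution is just a matrix multiply over the flattened
// spatial positions: out[oc][i] = sum_ic k[oc][ic] * in[ic][i].  That is why
// conv1x1s1_sgemm_pack1to4_int8_lsx below does no im2col at all -- it only
// reshapes the blob (w = w * h, h = 1) and forwards it to the sgemm kernel --
// and why the stride-2 variant first decimates the input and then reuses the
// stride-1 path.  Minimal scalar sketch of that identity (illustrative only;
// conv1x1s1_int8_reference is not part of ncnn, and packing is ignored):
static void conv1x1s1_int8_reference(const signed char* in, int* out, const signed char* weights,
                                     int inch, int outch, int spatial /* = w * h */)
{
    for (int oc = 0; oc < outch; oc++)
    {
        for (int i = 0; i < spatial; i++)
        {
            int sum = 0;
            for (int ic = 0; ic < inch; ic++)
                sum += (int)weights[oc * inch + ic] * (int)in[ic * spatial + i];
            out[oc * spatial + i] = sum; // requantization happens later in the layer
        }
    }
}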
+ +static void conv1x1s1_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack4.h b/src/layer/loongarch/convolution_1x1_pack4.h new file mode 100644 index 000000000000..cf5a5b8e3638 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack4.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
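// In ncnn's packing layout, elempack = 4 means four consecutive channels are
// interleaved per element, so one "pixel" of a packed channel group is a whole
// 4-float vector.  That is why conv1x1s2_sgemm_pack4_lsx below can shrink the
// stride-2 input with a single __lsx_vld/__lsx_vst per kept pixel.  Minimal
// sketch of the pack1 -> pack4 interleave on plain arrays (illustrative only;
// pack1_to_pack4 is not an ncnn function and inch is assumed to be a multiple of 4):
static void pack1_to_pack4(const float* planar, float* packed, int inch, int spatial)
{
    for (int c = 0; c < inch; c += 4)
    {
        float* outptr = packed + c * spatial; // start of this packed channel group
        for (int i = 0; i < spatial; i++)
        {
            outptr[i * 4 + 0] = planar[(c + 0) * spatial + i];
            outptr[i * 4 + 1] = planar[(c + 1) * spatial + i];
            outptr[i * 4 + 2] = planar[(c + 2) * spatial + i];
            outptr[i * 4 + 3] = planar[(c + 3) * spatial + i];
        }
    }
}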
+ +static void conv1x1s1_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} + +static void conv1x1s2_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = (w - 2 * outw + w) * 4; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* r0 = bottom_blob.channel(p); + float* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(r0, 0); + __lsx_vst(_val, outptr, 0); + + r0 += 4 * 2; + outptr += 4; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack4_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack4to1.h b/src/layer/loongarch/convolution_1x1_pack4to1.h new file mode 100644 index 000000000000..b87129091e4a --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack4to1.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
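// The stride-2 helpers in these headers (conv1x1s2_sgemm_pack4to1_lsx below and
// its siblings) turn a stride-2 1x1 convolution back into the stride-1 sgemm by
// keeping every second pixel of every second row.  The pointer bookkeeping is
// easiest to see with a worked example: with w = 7 the output width is
// outw = (7 - 1) / 2 + 1 = 4, the inner loop advances the read pointer by
// 2 * outw = 8 pixels, and tailstep = w - 2 * outw + w = 6 more pixels, so the
// pointer moves 2 * w = 14 pixels per output row -- exactly two input rows, as
// expected for vertical stride 2.  In the pack4 variants the same quantities are
// multiplied by 4 because each pixel holds four floats.  A tiny helper spelling
// out that formula (illustrative only, not part of ncnn):
static int conv1x1s2_row_tailstep(int w, int outw)
{
    // what remains of the current input row after 2 * outw pixels, plus the
    // whole next input row that vertical stride 2 skips
    return (w - 2 * outw) + w;
}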
+ +static void conv1x1s1_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} + +static void conv1x1s2_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = (w - 2 * outw + w) * 4; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* r0 = bottom_blob.channel(p); + float* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(r0, 0); + __lsx_vst(_val, outptr, 0); + + r0 += 4 * 2; + outptr += 4; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack8to1_int8.h b/src/layer/loongarch/convolution_1x1_pack8to1_int8.h new file mode 100644 index 000000000000..8df0e128b7fb --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack8to1_int8.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
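// On the int8 path, elempack = 8 packs eight consecutive channels into one
// element, so a single pixel of a packed channel group occupies exactly 8 bytes.
// conv1x1s2_sgemm_pack8to1_int8_lsx below exploits this by walking the rows as
// int64_t: one 64-bit load/store moves a whole packed pixel.  Minimal sketch of
// the pack1 -> pack8 interleave on plain arrays (illustrative only;
// pack1_to_pack8_int8 is not an ncnn function and inch is assumed to be a
// multiple of 8):
static void pack1_to_pack8_int8(const signed char* planar, signed char* packed, int inch, int spatial)
{
    for (int c = 0; c < inch; c += 8)
    {
        signed char* outptr = packed + c * spatial; // 8 channels interleaved per pixel
        for (int i = 0; i < spatial; i++)
        {
            for (int k = 0; k < 8; k++)
                outptr[i * 8 + k] = planar[(c + k) * spatial + i];
        }
    }
}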
+ +static void conv1x1s1_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack8to4_int8.h b/src/layer/loongarch/convolution_1x1_pack8to4_int8.h new file mode 100644 index 000000000000..6aaa720d23d0 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack8to4_int8.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
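// A note on accumulator width for the int8 kernels in these headers: products
// are summed in 32-bit integers and dequantization/requantization is left to
// the calling layer.  A single int8*int8 product is bounded by 128 * 128 = 16384,
// so a 32-bit accumulator can absorb roughly 2^31 / 16384 = 131072 taps before
// it could overflow -- far more than inch * kernel_w * kernel_h reaches in
// practice, which is why an int32 accumulator is sufficient for the dot products
// behind conv1x1s1_sgemm_pack8to4_int8_lsx below.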
+ +static void conv1x1s1_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_3x3.h b/src/layer/loongarch/convolution_3x3.h new file mode 100644 index 000000000000..66e10106b46c --- /dev/null +++ b/src/layer/loongarch/convolution_3x3.h @@ -0,0 +1,412 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
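// conv3x3s1_winograd23_* below implements Winograd F(2x2, 3x3): each 2x2 output
// tile is produced from a 4x4 input tile with 16 multiplies instead of the 36 a
// direct 3x3 convolution would need.  The 2-D transform is the 1-D F(2,3)
// transform applied to rows and then columns; the rows of the ktm[][] table in
// conv3x3s1_winograd23_transform_kernel_lsx are exactly the rows of the 1-D
// kernel transform G.  Minimal scalar sketch of the 1-D identity (illustrative
// only; winograd23_1d is not part of ncnn):
static void winograd23_1d(const float d[4], const float g[3], float y[2])
{
    // kernel transform  u = G g
    const float u0 = g[0];
    const float u1 = (g[0] + g[1] + g[2]) * 0.5f;
    const float u2 = (g[0] - g[1] + g[2]) * 0.5f;
    const float u3 = g[2];
    // input transform   v = Bt d
    const float v0 = d[0] - d[2];
    const float v1 = d[1] + d[2];
    const float v2 = d[2] - d[1];
    const float v3 = d[1] - d[3];
    // elementwise product and output transform  y = At (u .* v)
    const float m0 = u0 * v0, m1 = u1 * v1, m2 = u2 * v2, m3 = u3 * v3;
    y[0] = m0 + m1 + m2; // == d0*g0 + d1*g1 + d2*g2
    y[1] = m1 - m2 - m3; // == d1*g0 + d2*g1 + d3*g2
}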
+ +static void conv3x3s1_winograd23_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt) +{ + Mat kernel_tm(4 * 4, inch, outch); + + // G + const float ktm[4][3] = { + {1.0f, 0.0f, 0.0f}, + {1.0f / 2, 1.0f / 2, 1.0f / 2}, + {1.0f / 2, -1.0f / 2, 1.0f / 2}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[4][3]; + for (int i = 0; i < 4; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 4; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 4; i++) + { + kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 16-inch-outch + // dst = inch-16-outch +#if __loongarch_sx + kernel_tm2.create(8 * inch, 16, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm2.create(2 * inch, 16, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + Mat g0 = kernel_tm2.channel(q / 8); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 8; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 4; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + Mat g0 = kernel_tm2.channel(q / 2); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 2; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + Mat g0 = kernel_tm2.channel(q / 2 + q % 2); +#endif + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + const float* k00 = kernel_tm.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void conv3x3s1_winograd23_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 2n+2, winograd F(2,3) + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; + + w = outw + 2; + h = outh + 2; + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw 
/ 2; + int h_tiles = outh / 2; + int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 16, inch, 4u, opt.workspace_allocator); + conv3x3s1_winograd23_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); + } + { + conv3x3s1_winograd23_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd43_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt) +{ + Mat kernel_tm(6 * 6, inch, outch); + + // G + const float ktm[6][3] = { + {1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = inch-36-outch +#if __loongarch_sx + kernel_tm2.create(8 * inch, 36, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm2.create(2 * inch, 36, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + Mat g0 = kernel_tm2.channel(q / 8); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 8; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 4; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + Mat g0 = kernel_tm2.channel(q / 2); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 2; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + Mat g0 = kernel_tm2.channel(q / 8 + 
(q % 8) / 4 + q % 4); +#else + Mat g0 = kernel_tm2.channel(q / 2 + q % 2); +#endif + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + const float* k00 = kernel_tm.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void conv3x3s1_winograd43_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2, winograd F(4,3) + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 4u, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_int8.h b/src/layer/loongarch/convolution_3x3_int8.h new file mode 100644 index 000000000000..3ea28dd09445 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_int8.h @@ -0,0 +1,252 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
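// conv3x3s1_winograd43_transform_kernel_int8_lsx below is the integer analogue
// of the float F(4x4, 3x3) kernel transform above: its ktm[][] is an
// integer-scaled version of G so the whole tile pipeline can stay in fixed
// point; the scaling it introduces has to be compensated when the output tiles
// are transformed back and requantized.  The 16-bit storage of kernel_tm is
// safe by a simple bound: every row of ktm has an absolute sum of at most
// 4 + 4 + 4 = 12, and int8 weights lie in [-128, 127], so after the first pass
// |tmp| <= 12 * 128 = 1536 and after the second pass
// |kernel_tm| <= 12 * 1536 = 18432, comfortably below the int16 limit of 32767
// -- which is why kernel_tm is created with 2-byte elements.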
+ +static void conv3x3s1_winograd43_transform_kernel_int8_lsx(const Mat& kernel, Mat& kernel_tm_packed, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 2b-inch-36-outch/2b +#if __loongarch_sx + if (outch >= 4) + { + if (inch >= 4) + kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch / 4 + outch % 4, (size_t)2u * 16, 16); + else + kernel_tm_packed.create(inch, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); + } +#else // __loongarch_sx + if (outch >= 2) + { + kernel_tm_packed.create(inch, 36, outch / 2 + outch % 2, (size_t)2u * 2, 2); + } +#endif // __loongarch_sx + else + { +#if __loongarch_sx + if (inch >= 4) + kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch, (size_t)2u * 4, 4); + else +#endif // __loongarch_sx + { + kernel_tm_packed.create(inch, 36, outch, (size_t)2u, 1); + } + } + + int p = 0; +#if __loongarch_sx + for (; p + 3 < outch; p += 4) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + const Mat k2 = kernel_tm.channel(p + 2); + const Mat k3 = kernel_tm.channel(p + 3); + + Mat g0 = kernel_tm_packed.channel(p / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + g00[0] = k0.row(q)[k]; + g00[1] = k0.row(q + 1)[k]; + g00[2] = k0.row(q + 2)[k]; + g00[3] = k0.row(q + 3)[k]; + g00[4] = k1.row(q)[k]; + g00[5] = k1.row(q + 1)[k]; + g00[6] = k1.row(q + 2)[k]; + g00[7] = k1.row(q + 3)[k]; + g00[8] = k2.row(q)[k]; + g00[9] = k2.row(q + 1)[k]; + g00[10] = k2.row(q + 2)[k]; + g00[11] = k2.row(q + 3)[k]; + g00[12] = k3.row(q)[k]; + g00[13] = k3.row(q + 1)[k]; + g00[14] = k3.row(q + 2)[k]; + g00[15] = k3.row(q + 3)[k]; + g00 += 16; + } + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00[1] = k1.row(q)[k]; + g00[2] = k2.row(q)[k]; + g00[3] = k3.row(q)[k]; + g00 += 4; + } + } + } +#else // __loongarch_sx + for (; p + 1 < outch; p += 2) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + + Mat g0 = kernel_tm_packed.channel(p / 2); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00[1] = k1.row(q)[k]; + g00 += 2; + } + } + } +#endif // __loongarch_sx + for (; p < outch; p++) + { + const Mat k0 = kernel_tm.channel(p); + +#if __loongarch_sx + Mat g0 = kernel_tm_packed.channel(p / 4 + p % 4); +#else + Mat g0 = 
kernel_tm_packed.channel(p / 2 + p % 2); +#endif + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + g00[0] = k0.row(q)[k]; + g00[1] = k0.row(q + 1)[k]; + g00[2] = k0.row(q + 2)[k]; + g00[3] = k0.row(q + 3)[k]; + g00 += 4; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00 += 1; + } + } + } +} + +static void conv3x3s1_winograd43_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack1to4.h b/src/layer/loongarch/convolution_3x3_pack1to4.h new file mode 100644 index 000000000000..2bcb0ce166dd --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack1to4.h @@ -0,0 +1,812 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + const float* k0 = kernel.channel(p); + + int q = 0; + for (; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 7 < outw; j += 8) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0); + __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0); + __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0); + __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3); + _sum4 = __lsx_vfmadd_s(_k00, _r04, _sum4); + _sum5 = __lsx_vfmadd_s(_k00, _r05, _sum5); + _sum6 = __lsx_vfmadd_s(_k00, _r06, _sum6); + _sum7 = __lsx_vfmadd_s(_k00, _r07, _sum7); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3); + _sum4 = __lsx_vfmadd_s(_k01, _r05, _sum4); + _sum5 = __lsx_vfmadd_s(_k01, _r06, _sum5); + _sum6 = __lsx_vfmadd_s(_k01, _r07, _sum6); + _sum7 = __lsx_vfmadd_s(_k01, _r08, _sum7); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3); + _sum4 = __lsx_vfmadd_s(_k02, _r06, _sum4); + _sum5 = __lsx_vfmadd_s(_k02, _r07, _sum5); + _sum6 = __lsx_vfmadd_s(_k02, _r08, _sum6); + _sum7 = __lsx_vfmadd_s(_k02, _r09, _sum7); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = 
(__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3); + _sum4 = __lsx_vfmadd_s(_k10, _r14, _sum4); + _sum5 = __lsx_vfmadd_s(_k10, _r15, _sum5); + _sum6 = __lsx_vfmadd_s(_k10, _r16, _sum6); + _sum7 = __lsx_vfmadd_s(_k10, _r17, _sum7); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3); + _sum4 = __lsx_vfmadd_s(_k11, _r15, _sum4); + _sum5 = __lsx_vfmadd_s(_k11, _r16, _sum5); + _sum6 = __lsx_vfmadd_s(_k11, _r17, _sum6); + _sum7 = __lsx_vfmadd_s(_k11, _r18, _sum7); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3); + _sum4 = __lsx_vfmadd_s(_k12, _r16, _sum4); + _sum5 = __lsx_vfmadd_s(_k12, _r17, _sum5); + _sum6 = __lsx_vfmadd_s(_k12, _r18, _sum6); + _sum7 = __lsx_vfmadd_s(_k12, _r19, _sum7); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3); + _sum4 = __lsx_vfmadd_s(_k20, _r24, _sum4); + _sum5 = __lsx_vfmadd_s(_k20, _r25, _sum5); + _sum6 = __lsx_vfmadd_s(_k20, _r26, _sum6); + _sum7 = __lsx_vfmadd_s(_k20, _r27, _sum7); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3); + _sum4 = __lsx_vfmadd_s(_k21, _r25, _sum4); + _sum5 = __lsx_vfmadd_s(_k21, _r26, _sum5); + _sum6 = __lsx_vfmadd_s(_k21, _r27, _sum6); + _sum7 = __lsx_vfmadd_s(_k21, _r28, _sum7); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3); + _sum4 = __lsx_vfmadd_s(_k22, _r26, _sum4); + _sum5 = __lsx_vfmadd_s(_k22, _r27, _sum5); + _sum6 = __lsx_vfmadd_s(_k22, _r28, _sum6); + _sum7 = __lsx_vfmadd_s(_k22, _r29, _sum7); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + + r0 += 8; + r1 += 8; + r2 += 8; + } + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = 
__lsx_vld(r0 + 4, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 4; + r1 += 4; + r2 += 4; + } + for (; j + 1 < outw; j += 2) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, 
_r03, _sum1); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 2; + r1 += 2; + r2 += 2; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 1; + r1 += 1; + r2 += 1; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + + k0 += 9 * 4; + } + } +} + +static void conv3x3s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + const float* k0 = kernel.channel(p); + + int q = 0; + for (; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 7 < outw; j += 8) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0); + __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0); + __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0); + __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + __m128i _r0nnn = __lsx_vld(r0 + 12, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2); + __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3); + __m128 _r0c = (__m128)__lsx_vreplvei_w(_r0nnn, 0); + __m128 _r0d = (__m128)__lsx_vreplvei_w(_r0nnn, 1); + __m128 _r0e = (__m128)__lsx_vreplvei_w(_r0nnn, 2); + __m128 _r0f = (__m128)__lsx_vreplvei_w(_r0nnn, 3); + __m128 _r0g = __lsx_vreplfr2vr_s(r0[16]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum4 = __lsx_vfmadd_s(_k00, _r08, _sum4); + _sum5 = __lsx_vfmadd_s(_k00, _r0a, _sum5); + _sum6 = __lsx_vfmadd_s(_k00, _r0c, _sum6); + _sum7 = __lsx_vfmadd_s(_k00, _r0e, _sum7); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum4 = __lsx_vfmadd_s(_k01, _r09, _sum4); + _sum5 = __lsx_vfmadd_s(_k01, _r0b, _sum5); + _sum6 = __lsx_vfmadd_s(_k01, _r0d, _sum6); + _sum7 = __lsx_vfmadd_s(_k01, _r0f, _sum7); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + _sum4 = __lsx_vfmadd_s(_k02, _r0a, _sum4); + _sum5 = __lsx_vfmadd_s(_k02, _r0c, _sum5); + _sum6 = __lsx_vfmadd_s(_k02, _r0e, _sum6); + _sum7 = __lsx_vfmadd_s(_k02, _r0g, _sum7); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + __m128i _r1nnn = 
__lsx_vld(r1 + 12, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2); + __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3); + __m128 _r1c = (__m128)__lsx_vreplvei_w(_r1nnn, 0); + __m128 _r1d = (__m128)__lsx_vreplvei_w(_r1nnn, 1); + __m128 _r1e = (__m128)__lsx_vreplvei_w(_r1nnn, 2); + __m128 _r1f = (__m128)__lsx_vreplvei_w(_r1nnn, 3); + __m128 _r1g = __lsx_vreplfr2vr_s(r1[16]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum4 = __lsx_vfmadd_s(_k10, _r18, _sum4); + _sum5 = __lsx_vfmadd_s(_k10, _r1a, _sum5); + _sum6 = __lsx_vfmadd_s(_k10, _r1c, _sum6); + _sum7 = __lsx_vfmadd_s(_k10, _r1e, _sum7); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum4 = __lsx_vfmadd_s(_k11, _r19, _sum4); + _sum5 = __lsx_vfmadd_s(_k11, _r1b, _sum5); + _sum6 = __lsx_vfmadd_s(_k11, _r1d, _sum6); + _sum7 = __lsx_vfmadd_s(_k11, _r1f, _sum7); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + _sum4 = __lsx_vfmadd_s(_k12, _r1a, _sum4); + _sum5 = __lsx_vfmadd_s(_k12, _r1c, _sum5); + _sum6 = __lsx_vfmadd_s(_k12, _r1e, _sum6); + _sum7 = __lsx_vfmadd_s(_k12, _r1g, _sum7); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + __m128i _r2nnn = __lsx_vld(r2 + 12, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2); + __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3); + __m128 _r2c = (__m128)__lsx_vreplvei_w(_r2nnn, 0); + __m128 _r2d = (__m128)__lsx_vreplvei_w(_r2nnn, 1); + __m128 _r2e = (__m128)__lsx_vreplvei_w(_r2nnn, 2); + __m128 _r2f = (__m128)__lsx_vreplvei_w(_r2nnn, 3); + __m128 _r2g = __lsx_vreplfr2vr_s(r2[16]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum4 = __lsx_vfmadd_s(_k20, _r28, _sum4); + _sum5 = __lsx_vfmadd_s(_k20, _r2a, _sum5); + _sum6 = __lsx_vfmadd_s(_k20, _r2c, _sum6); + _sum7 = __lsx_vfmadd_s(_k20, _r2e, _sum7); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3); + _sum4 = 
__lsx_vfmadd_s(_k21, _r29, _sum4); + _sum5 = __lsx_vfmadd_s(_k21, _r2b, _sum5); + _sum6 = __lsx_vfmadd_s(_k21, _r2d, _sum6); + _sum7 = __lsx_vfmadd_s(_k21, _r2f, _sum7); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + _sum4 = __lsx_vfmadd_s(_k22, _r2a, _sum4); + _sum5 = __lsx_vfmadd_s(_k22, _r2c, _sum5); + _sum6 = __lsx_vfmadd_s(_k22, _r2e, _sum6); + _sum7 = __lsx_vfmadd_s(_k22, _r2g, _sum7); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + + r0 += 16; + r1 += 16; + r2 += 16; + } + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = __lsx_vreplfr2vr_s(r0[8]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = __lsx_vreplfr2vr_s(r1[8]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = 
(__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = __lsx_vreplfr2vr_s(r2[8]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 8; + r1 += 8; + r2 += 8; + } + for (; j + 1 < outw; j += 2) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = __lsx_vreplfr2vr_s(r0[4]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = __lsx_vreplfr2vr_s(r1[4]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = __lsx_vreplfr2vr_s(r2[4]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4; + r1 += 4; + r2 += 4; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + + _sum0 = 
__lsx_vfmadd_s(_k10, _r10, _sum0); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 2; + r1 += 2; + r2 += 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + + k0 += 9 * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_3x3_pack4.h b/src/layer/loongarch/convolution_3x3_pack4.h new file mode 100644 index 000000000000..f06bb7e9068c --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack4.h @@ -0,0 +1,425 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd63_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd63 transform kernel + Mat kernel_tm; + kernel_tm.create(8 * 8, inch, outch); + + const float ktm[8][3] = { + {1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[8][3]; + for (int i = 0; i < 8; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // v + for (int j = 0; j < 8; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) + { + kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 64-inch-outch + // dst = pb-pa-inch/pa-64-outch/pb + kernel_tm_pack4.create(inch / 4, 64, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 64; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } +} + +static void 
conv3x3s1_winograd63_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 6n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 5) / 6 * 6; + outh = (outh + 5) / 6 * 6; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 6; + int h_tiles = outh / 6; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd63_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd63_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd43_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch); + + const float ktm[6][3] = { + {1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = pb-pa-inch/pa-36-outch/pb + kernel_tm_pack4.create(inch / 4, 36, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } 
+} + +static void conv3x3s1_winograd43_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd23_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd23 transform kernel + Mat kernel_tm(4 * 4, inch, outch); + + const float ktm[4][3] = { + {1.0f, 0.0f, 0.0f}, + {1.0f / 2, 1.0f / 2, 1.0f / 2}, + {1.0f / 2, -1.0f / 2, 1.0f / 2}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[4][3]; + for (int i = 0; i < 4; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 4; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 4; i++) + { + kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 16-inch-outch + // dst = pb-pa-inch/pa-16-outch/pb + kernel_tm_pack4.create(inch / 4, 16, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } +} + +static void conv3x3s1_winograd23_pack4_lsx(const Mat& 
bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 2n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 2; + int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 16, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd23_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd23_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack8to1_int8.h b/src/layer/loongarch/convolution_3x3_pack8to1_int8.h new file mode 100644 index 000000000000..3c4f97187533 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack8to1_int8.h @@ -0,0 +1,177 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
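The function that follows applies the Winograd F(4,3) kernel transform to every 3x3 int8 kernel, producing a 6x6 tile of 16-bit values (effectively G·g·Gᵀ with the integer G given by ktm, stored transposed), and then interleaves the tiles so that eight input channels and four output channels share each row of kernel_tm_pack8to1. As a minimal scalar sketch of the per-kernel transform only (for orientation; the helper name is illustrative and not part of the patch):

static void winograd43_int8_kernel_transform_ref(const signed char* g, short* U)
{
    // integer F(4,3) transform matrix, identical to ktm in the patch below
    static const short G[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    // first pass: dot each kernel row r (g is row-major 3x3) with every row of G
    short tmp[6][3];
    for (int i = 0; i < 6; i++)
    {
        for (int r = 0; r < 3; r++)
        {
            tmp[i][r] = G[i][0] * g[r * 3 + 0] + G[i][1] * g[r * 3 + 1] + G[i][2] * g[r * 3 + 2];
        }
    }

    // second pass: combine across rows; same indexing as kernel_tm0[j * 6 + i] below
    for (int j = 0; j < 6; j++)
    {
        for (int i = 0; i < 6; i++)
        {
            U[j * 6 + i] = tmp[j][0] * G[i][0] + tmp[j][1] * G[i][1] + tmp[j][2] * G[i][2];
        }
    }
}

The interleave step that follows the transform in the patch only reorders these 36-element tiles for packed access; it does not change their values.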
+ +static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); + + int p = 0; + for (; p + 3 < outch; p += 4) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + const Mat k2 = kernel_tm.channel(p + 2); + const Mat k3 = kernel_tm.channel(p + 3); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + g00[1] = k1.row(q + i)[k]; + g00[2] = k2.row(q + i)[k]; + g00[3] = k3.row(q + i)[k]; + + g00 += 4; + } + } + } + } + for (; p < outch; p++) + { + const Mat k0 = kernel_tm.channel(p); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + + g00 += 1; + } + } + } + } +} + +static void conv3x3s1_winograd43_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack8to1_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat 
top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack8to4_int8.h b/src/layer/loongarch/convolution_3x3_pack8to4_int8.h new file mode 100644 index 000000000000..bf328cee73f9 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack8to4_int8.h @@ -0,0 +1,161 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel_tm.channel(q); + const Mat k1 = kernel_tm.channel(q + 1); + const Mat k2 = kernel_tm.channel(q + 2); + const Mat k3 = kernel_tm.channel(q + 3); + + Mat kernel_tm = kernel_tm_pack8.channel(q / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = kernel_tm.row(k); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int i = 0; i < 8; i++) + { + const short* k00 = k0.row(p + i); + const short* k10 = k1.row(p + i); + const short* k20 = k2.row(p + i); + const short* k30 = k3.row(p + i); + + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + + g00 += 4; + } + } + } + } +} + +static void 
conv3x3s1_winograd43_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack8to4_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_pack4_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_7x7_pack1to4.h b/src/layer/loongarch/convolution_7x7_pack1to4.h new file mode 100644 index 000000000000..f57923b53d00 --- /dev/null +++ b/src/layer/loongarch/convolution_7x7_pack1to4.h @@ -0,0 +1,652 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv7x7s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + for (int q = 0; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + const float* r5 = img0.row(5); + const float* r6 = img0.row(6); + + const float* kptr = kernel.channel(p).row(q); + + int i = 0; + + for (; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128 _k00 = (__m128)__lsx_vld(kptr, 0); + __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2); + __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3); + __m128 _r0c = __lsx_vreplfr2vr_s(r0[12]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + _sum0 = __lsx_vfmadd_s(_k03, _r03, _sum0); + _sum1 = __lsx_vfmadd_s(_k03, _r05, _sum1); + _sum2 = __lsx_vfmadd_s(_k03, _r07, _sum2); + _sum3 = __lsx_vfmadd_s(_k03, _r09, _sum3); + _sum0 = __lsx_vfmadd_s(_k04, _r04, _sum0); + _sum1 = __lsx_vfmadd_s(_k04, _r06, _sum1); + _sum2 = __lsx_vfmadd_s(_k04, _r08, _sum2); + _sum3 = __lsx_vfmadd_s(_k04, _r0a, _sum3); + _sum0 = __lsx_vfmadd_s(_k05, _r05, _sum0); + _sum1 = __lsx_vfmadd_s(_k05, _r07, _sum1); + _sum2 = __lsx_vfmadd_s(_k05, _r09, _sum2); + _sum3 = __lsx_vfmadd_s(_k05, _r0b, _sum3); + _sum0 = __lsx_vfmadd_s(_k06, _r06, _sum0); + _sum1 = __lsx_vfmadd_s(_k06, _r08, _sum1); + _sum2 = __lsx_vfmadd_s(_k06, _r0a, _sum2); + _sum3 = __lsx_vfmadd_s(_k06, _r0c, _sum3); + + __m128 _k10 = (__m128)__lsx_vld(kptr, 0); + __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + 
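+ // rows 1..6 of the 7x7 kernel follow the same pattern as row 0: seven packed kernel vectors FMA'd against stride-2 taps of the corresponding input row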
kptr += 4 * 7; + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2); + __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3); + __m128 _r1c = __lsx_vreplfr2vr_s(r1[12]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + _sum0 = __lsx_vfmadd_s(_k13, _r13, _sum0); + _sum1 = __lsx_vfmadd_s(_k13, _r15, _sum1); + _sum2 = __lsx_vfmadd_s(_k13, _r17, _sum2); + _sum3 = __lsx_vfmadd_s(_k13, _r19, _sum3); + _sum0 = __lsx_vfmadd_s(_k14, _r14, _sum0); + _sum1 = __lsx_vfmadd_s(_k14, _r16, _sum1); + _sum2 = __lsx_vfmadd_s(_k14, _r18, _sum2); + _sum3 = __lsx_vfmadd_s(_k14, _r1a, _sum3); + _sum0 = __lsx_vfmadd_s(_k15, _r15, _sum0); + _sum1 = __lsx_vfmadd_s(_k15, _r17, _sum1); + _sum2 = __lsx_vfmadd_s(_k15, _r19, _sum2); + _sum3 = __lsx_vfmadd_s(_k15, _r1b, _sum3); + _sum0 = __lsx_vfmadd_s(_k16, _r16, _sum0); + _sum1 = __lsx_vfmadd_s(_k16, _r18, _sum1); + _sum2 = __lsx_vfmadd_s(_k16, _r1a, _sum2); + _sum3 = __lsx_vfmadd_s(_k16, _r1c, _sum3); + + __m128 _k20 = (__m128)__lsx_vld(kptr, 0); + __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2); + __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3); + __m128 _r2c = __lsx_vreplfr2vr_s(r2[12]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, 
_sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + _sum0 = __lsx_vfmadd_s(_k23, _r23, _sum0); + _sum1 = __lsx_vfmadd_s(_k23, _r25, _sum1); + _sum2 = __lsx_vfmadd_s(_k23, _r27, _sum2); + _sum3 = __lsx_vfmadd_s(_k23, _r29, _sum3); + _sum0 = __lsx_vfmadd_s(_k24, _r24, _sum0); + _sum1 = __lsx_vfmadd_s(_k24, _r26, _sum1); + _sum2 = __lsx_vfmadd_s(_k24, _r28, _sum2); + _sum3 = __lsx_vfmadd_s(_k24, _r2a, _sum3); + _sum0 = __lsx_vfmadd_s(_k25, _r25, _sum0); + _sum1 = __lsx_vfmadd_s(_k25, _r27, _sum1); + _sum2 = __lsx_vfmadd_s(_k25, _r29, _sum2); + _sum3 = __lsx_vfmadd_s(_k25, _r2b, _sum3); + _sum0 = __lsx_vfmadd_s(_k26, _r26, _sum0); + _sum1 = __lsx_vfmadd_s(_k26, _r28, _sum1); + _sum2 = __lsx_vfmadd_s(_k26, _r2a, _sum2); + _sum3 = __lsx_vfmadd_s(_k26, _r2c, _sum3); + + __m128 _k30 = (__m128)__lsx_vld(kptr, 0); + __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r3 = __lsx_vld(r3, 0); + __m128i _r3n = __lsx_vld(r3 + 4, 0); + __m128i _r3nn = __lsx_vld(r3 + 8, 0); + + __m128 _r30 = (__m128)__lsx_vreplvei_w(_r3, 0); + __m128 _r31 = (__m128)__lsx_vreplvei_w(_r3, 1); + __m128 _r32 = (__m128)__lsx_vreplvei_w(_r3, 2); + __m128 _r33 = (__m128)__lsx_vreplvei_w(_r3, 3); + __m128 _r34 = (__m128)__lsx_vreplvei_w(_r3n, 0); + __m128 _r35 = (__m128)__lsx_vreplvei_w(_r3n, 1); + __m128 _r36 = (__m128)__lsx_vreplvei_w(_r3n, 2); + __m128 _r37 = (__m128)__lsx_vreplvei_w(_r3n, 3); + __m128 _r38 = (__m128)__lsx_vreplvei_w(_r3nn, 0); + __m128 _r39 = (__m128)__lsx_vreplvei_w(_r3nn, 1); + __m128 _r3a = (__m128)__lsx_vreplvei_w(_r3nn, 2); + __m128 _r3b = (__m128)__lsx_vreplvei_w(_r3nn, 3); + __m128 _r3c = __lsx_vreplfr2vr_s(r3[12]); + + _sum0 = __lsx_vfmadd_s(_k30, _r30, _sum0); + _sum1 = __lsx_vfmadd_s(_k30, _r32, _sum1); + _sum2 = __lsx_vfmadd_s(_k30, _r34, _sum2); + _sum3 = __lsx_vfmadd_s(_k30, _r36, _sum3); + _sum0 = __lsx_vfmadd_s(_k31, _r31, _sum0); + _sum1 = __lsx_vfmadd_s(_k31, _r33, _sum1); + _sum2 = __lsx_vfmadd_s(_k31, _r35, _sum2); + _sum3 = __lsx_vfmadd_s(_k31, _r37, _sum3); + _sum0 = __lsx_vfmadd_s(_k32, _r32, _sum0); + _sum1 = __lsx_vfmadd_s(_k32, _r34, _sum1); + _sum2 = __lsx_vfmadd_s(_k32, _r36, _sum2); + _sum3 = __lsx_vfmadd_s(_k32, _r38, _sum3); + _sum0 = __lsx_vfmadd_s(_k33, _r33, _sum0); + _sum1 = __lsx_vfmadd_s(_k33, _r35, _sum1); + _sum2 = __lsx_vfmadd_s(_k33, _r37, _sum2); + _sum3 = __lsx_vfmadd_s(_k33, _r39, _sum3); + _sum0 = __lsx_vfmadd_s(_k34, _r34, _sum0); + _sum1 = __lsx_vfmadd_s(_k34, _r36, _sum1); + _sum2 = __lsx_vfmadd_s(_k34, _r38, _sum2); + _sum3 = __lsx_vfmadd_s(_k34, _r3a, _sum3); + _sum0 = __lsx_vfmadd_s(_k35, _r35, _sum0); + _sum1 = __lsx_vfmadd_s(_k35, _r37, _sum1); + _sum2 = __lsx_vfmadd_s(_k35, _r39, _sum2); + _sum3 = __lsx_vfmadd_s(_k35, _r3b, _sum3); + _sum0 = __lsx_vfmadd_s(_k36, _r36, _sum0); + _sum1 = __lsx_vfmadd_s(_k36, _r38, _sum1); + _sum2 = __lsx_vfmadd_s(_k36, _r3a, _sum2); + _sum3 = __lsx_vfmadd_s(_k36, _r3c, _sum3); + + __m128 _k40 = (__m128)__lsx_vld(kptr, 0); + __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(kptr + 
4 * 4, 0); + __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r4 = __lsx_vld(r4, 0); + __m128i _r4n = __lsx_vld(r4 + 4, 0); + __m128i _r4nn = __lsx_vld(r4 + 8, 0); + + __m128 _r40 = (__m128)__lsx_vreplvei_w(_r4, 0); + __m128 _r41 = (__m128)__lsx_vreplvei_w(_r4, 1); + __m128 _r42 = (__m128)__lsx_vreplvei_w(_r4, 2); + __m128 _r43 = (__m128)__lsx_vreplvei_w(_r4, 3); + __m128 _r44 = (__m128)__lsx_vreplvei_w(_r4n, 0); + __m128 _r45 = (__m128)__lsx_vreplvei_w(_r4n, 1); + __m128 _r46 = (__m128)__lsx_vreplvei_w(_r4n, 2); + __m128 _r47 = (__m128)__lsx_vreplvei_w(_r4n, 3); + __m128 _r48 = (__m128)__lsx_vreplvei_w(_r4nn, 0); + __m128 _r49 = (__m128)__lsx_vreplvei_w(_r4nn, 1); + __m128 _r4a = (__m128)__lsx_vreplvei_w(_r4nn, 2); + __m128 _r4b = (__m128)__lsx_vreplvei_w(_r4nn, 3); + __m128 _r4c = __lsx_vreplfr2vr_s(r4[12]); + + _sum0 = __lsx_vfmadd_s(_k40, _r40, _sum0); + _sum1 = __lsx_vfmadd_s(_k40, _r42, _sum1); + _sum2 = __lsx_vfmadd_s(_k40, _r44, _sum2); + _sum3 = __lsx_vfmadd_s(_k40, _r46, _sum3); + _sum0 = __lsx_vfmadd_s(_k41, _r41, _sum0); + _sum1 = __lsx_vfmadd_s(_k41, _r43, _sum1); + _sum2 = __lsx_vfmadd_s(_k41, _r45, _sum2); + _sum3 = __lsx_vfmadd_s(_k41, _r47, _sum3); + _sum0 = __lsx_vfmadd_s(_k42, _r42, _sum0); + _sum1 = __lsx_vfmadd_s(_k42, _r44, _sum1); + _sum2 = __lsx_vfmadd_s(_k42, _r46, _sum2); + _sum3 = __lsx_vfmadd_s(_k42, _r48, _sum3); + _sum0 = __lsx_vfmadd_s(_k43, _r43, _sum0); + _sum1 = __lsx_vfmadd_s(_k43, _r45, _sum1); + _sum2 = __lsx_vfmadd_s(_k43, _r47, _sum2); + _sum3 = __lsx_vfmadd_s(_k43, _r49, _sum3); + _sum0 = __lsx_vfmadd_s(_k44, _r44, _sum0); + _sum1 = __lsx_vfmadd_s(_k44, _r46, _sum1); + _sum2 = __lsx_vfmadd_s(_k44, _r48, _sum2); + _sum3 = __lsx_vfmadd_s(_k44, _r4a, _sum3); + _sum0 = __lsx_vfmadd_s(_k45, _r45, _sum0); + _sum1 = __lsx_vfmadd_s(_k45, _r47, _sum1); + _sum2 = __lsx_vfmadd_s(_k45, _r49, _sum2); + _sum3 = __lsx_vfmadd_s(_k45, _r4b, _sum3); + _sum0 = __lsx_vfmadd_s(_k46, _r46, _sum0); + _sum1 = __lsx_vfmadd_s(_k46, _r48, _sum1); + _sum2 = __lsx_vfmadd_s(_k46, _r4a, _sum2); + _sum3 = __lsx_vfmadd_s(_k46, _r4c, _sum3); + + __m128 _k50 = (__m128)__lsx_vld(kptr, 0); + __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r5 = __lsx_vld(r5, 0); + __m128i _r5n = __lsx_vld(r5 + 4, 0); + __m128i _r5nn = __lsx_vld(r5 + 8, 0); + + __m128 _r50 = (__m128)__lsx_vreplvei_w(_r5, 0); + __m128 _r51 = (__m128)__lsx_vreplvei_w(_r5, 1); + __m128 _r52 = (__m128)__lsx_vreplvei_w(_r5, 2); + __m128 _r53 = (__m128)__lsx_vreplvei_w(_r5, 3); + __m128 _r54 = (__m128)__lsx_vreplvei_w(_r5n, 0); + __m128 _r55 = (__m128)__lsx_vreplvei_w(_r5n, 1); + __m128 _r56 = (__m128)__lsx_vreplvei_w(_r5n, 2); + __m128 _r57 = (__m128)__lsx_vreplvei_w(_r5n, 3); + __m128 _r58 = (__m128)__lsx_vreplvei_w(_r5nn, 0); + __m128 _r59 = (__m128)__lsx_vreplvei_w(_r5nn, 1); + __m128 _r5a = (__m128)__lsx_vreplvei_w(_r5nn, 2); + __m128 _r5b = (__m128)__lsx_vreplvei_w(_r5nn, 3); + __m128 _r5c = __lsx_vreplfr2vr_s(r5[12]); + + _sum0 = __lsx_vfmadd_s(_k50, _r50, _sum0); + _sum1 = __lsx_vfmadd_s(_k50, _r52, _sum1); + _sum2 = __lsx_vfmadd_s(_k50, _r54, _sum2); + _sum3 = __lsx_vfmadd_s(_k50, _r56, _sum3); + _sum0 = __lsx_vfmadd_s(_k51, _r51, _sum0); + _sum1 = 
__lsx_vfmadd_s(_k51, _r53, _sum1); + _sum2 = __lsx_vfmadd_s(_k51, _r55, _sum2); + _sum3 = __lsx_vfmadd_s(_k51, _r57, _sum3); + _sum0 = __lsx_vfmadd_s(_k52, _r52, _sum0); + _sum1 = __lsx_vfmadd_s(_k52, _r54, _sum1); + _sum2 = __lsx_vfmadd_s(_k52, _r56, _sum2); + _sum3 = __lsx_vfmadd_s(_k52, _r58, _sum3); + _sum0 = __lsx_vfmadd_s(_k53, _r53, _sum0); + _sum1 = __lsx_vfmadd_s(_k53, _r55, _sum1); + _sum2 = __lsx_vfmadd_s(_k53, _r57, _sum2); + _sum3 = __lsx_vfmadd_s(_k53, _r59, _sum3); + _sum0 = __lsx_vfmadd_s(_k54, _r54, _sum0); + _sum1 = __lsx_vfmadd_s(_k54, _r56, _sum1); + _sum2 = __lsx_vfmadd_s(_k54, _r58, _sum2); + _sum3 = __lsx_vfmadd_s(_k54, _r5a, _sum3); + _sum0 = __lsx_vfmadd_s(_k55, _r55, _sum0); + _sum1 = __lsx_vfmadd_s(_k55, _r57, _sum1); + _sum2 = __lsx_vfmadd_s(_k55, _r59, _sum2); + _sum3 = __lsx_vfmadd_s(_k55, _r5b, _sum3); + _sum0 = __lsx_vfmadd_s(_k56, _r56, _sum0); + _sum1 = __lsx_vfmadd_s(_k56, _r58, _sum1); + _sum2 = __lsx_vfmadd_s(_k56, _r5a, _sum2); + _sum3 = __lsx_vfmadd_s(_k56, _r5c, _sum3); + + __m128 _k60 = (__m128)__lsx_vld(kptr, 0); + __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr -= 4 * 42; + + __m128i _r6 = __lsx_vld(r6, 0); + __m128i _r6n = __lsx_vld(r6 + 4, 0); + __m128i _r6nn = __lsx_vld(r6 + 8, 0); + + __m128 _r60 = (__m128)__lsx_vreplvei_w(_r6, 0); + __m128 _r61 = (__m128)__lsx_vreplvei_w(_r6, 1); + __m128 _r62 = (__m128)__lsx_vreplvei_w(_r6, 2); + __m128 _r63 = (__m128)__lsx_vreplvei_w(_r6, 3); + __m128 _r64 = (__m128)__lsx_vreplvei_w(_r6n, 0); + __m128 _r65 = (__m128)__lsx_vreplvei_w(_r6n, 1); + __m128 _r66 = (__m128)__lsx_vreplvei_w(_r6n, 2); + __m128 _r67 = (__m128)__lsx_vreplvei_w(_r6n, 3); + __m128 _r68 = (__m128)__lsx_vreplvei_w(_r6nn, 0); + __m128 _r69 = (__m128)__lsx_vreplvei_w(_r6nn, 1); + __m128 _r6a = (__m128)__lsx_vreplvei_w(_r6nn, 2); + __m128 _r6b = (__m128)__lsx_vreplvei_w(_r6nn, 3); + __m128 _r6c = __lsx_vreplfr2vr_s(r6[12]); + + _sum0 = __lsx_vfmadd_s(_k60, _r60, _sum0); + _sum1 = __lsx_vfmadd_s(_k60, _r62, _sum1); + _sum2 = __lsx_vfmadd_s(_k60, _r64, _sum2); + _sum3 = __lsx_vfmadd_s(_k60, _r66, _sum3); + _sum0 = __lsx_vfmadd_s(_k61, _r61, _sum0); + _sum1 = __lsx_vfmadd_s(_k61, _r63, _sum1); + _sum2 = __lsx_vfmadd_s(_k61, _r65, _sum2); + _sum3 = __lsx_vfmadd_s(_k61, _r67, _sum3); + _sum0 = __lsx_vfmadd_s(_k62, _r62, _sum0); + _sum1 = __lsx_vfmadd_s(_k62, _r64, _sum1); + _sum2 = __lsx_vfmadd_s(_k62, _r66, _sum2); + _sum3 = __lsx_vfmadd_s(_k62, _r68, _sum3); + _sum0 = __lsx_vfmadd_s(_k63, _r63, _sum0); + _sum1 = __lsx_vfmadd_s(_k63, _r65, _sum1); + _sum2 = __lsx_vfmadd_s(_k63, _r67, _sum2); + _sum3 = __lsx_vfmadd_s(_k63, _r69, _sum3); + _sum0 = __lsx_vfmadd_s(_k64, _r64, _sum0); + _sum1 = __lsx_vfmadd_s(_k64, _r66, _sum1); + _sum2 = __lsx_vfmadd_s(_k64, _r68, _sum2); + _sum3 = __lsx_vfmadd_s(_k64, _r6a, _sum3); + _sum0 = __lsx_vfmadd_s(_k65, _r65, _sum0); + _sum1 = __lsx_vfmadd_s(_k65, _r67, _sum1); + _sum2 = __lsx_vfmadd_s(_k65, _r69, _sum2); + _sum3 = __lsx_vfmadd_s(_k65, _r6b, _sum3); + _sum0 = __lsx_vfmadd_s(_k66, _r66, _sum0); + _sum1 = __lsx_vfmadd_s(_k66, _r68, _sum1); + _sum2 = __lsx_vfmadd_s(_k66, _r6a, _sum2); + _sum3 = __lsx_vfmadd_s(_k66, _r6c, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + 
__lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + r4 += 8; + r5 += 8; + r6 += 8; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128 _k00 = (__m128)__lsx_vld(kptr, 0); + __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k00, (__m128)__lsx_vreplvei_w(_r0, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k01, (__m128)__lsx_vreplvei_w(_r0, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k02, (__m128)__lsx_vreplvei_w(_r0, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k03, (__m128)__lsx_vreplvei_w(_r0, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k04, (__m128)__lsx_vreplvei_w(_r0n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k05, (__m128)__lsx_vreplvei_w(_r0n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k06, (__m128)__lsx_vreplvei_w(_r0n, 2), _sum0); + + __m128 _k10 = (__m128)__lsx_vld(kptr, 0); + __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k10, (__m128)__lsx_vreplvei_w(_r1, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k11, (__m128)__lsx_vreplvei_w(_r1, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k12, (__m128)__lsx_vreplvei_w(_r1, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k13, (__m128)__lsx_vreplvei_w(_r1, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k14, (__m128)__lsx_vreplvei_w(_r1n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k15, (__m128)__lsx_vreplvei_w(_r1n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k16, (__m128)__lsx_vreplvei_w(_r1n, 2), _sum0); + + __m128 _k20 = (__m128)__lsx_vld(kptr, 0); + __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k20, (__m128)__lsx_vreplvei_w(_r2, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k21, (__m128)__lsx_vreplvei_w(_r2, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k22, (__m128)__lsx_vreplvei_w(_r2, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k23, (__m128)__lsx_vreplvei_w(_r2, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k24, (__m128)__lsx_vreplvei_w(_r2n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k25, (__m128)__lsx_vreplvei_w(_r2n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k26, (__m128)__lsx_vreplvei_w(_r2n, 2), _sum0); + + __m128 _k30 = (__m128)__lsx_vld(kptr, 0); + __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r3 = __lsx_vld(r3, 0); + __m128i _r3n = __lsx_vld(r3 + 4, 
0); + + _sum0 = __lsx_vfmadd_s(_k30, (__m128)__lsx_vreplvei_w(_r3, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k31, (__m128)__lsx_vreplvei_w(_r3, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k32, (__m128)__lsx_vreplvei_w(_r3, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k33, (__m128)__lsx_vreplvei_w(_r3, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k34, (__m128)__lsx_vreplvei_w(_r3n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k35, (__m128)__lsx_vreplvei_w(_r3n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k36, (__m128)__lsx_vreplvei_w(_r3n, 2), _sum0); + + __m128 _k40 = (__m128)__lsx_vld(kptr, 0); + __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r4 = __lsx_vld(r4, 0); + __m128i _r4n = __lsx_vld(r4 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k40, (__m128)__lsx_vreplvei_w(_r4, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k41, (__m128)__lsx_vreplvei_w(_r4, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k42, (__m128)__lsx_vreplvei_w(_r4, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k43, (__m128)__lsx_vreplvei_w(_r4, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k44, (__m128)__lsx_vreplvei_w(_r4n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k45, (__m128)__lsx_vreplvei_w(_r4n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k46, (__m128)__lsx_vreplvei_w(_r4n, 2), _sum0); + + __m128 _k50 = (__m128)__lsx_vld(kptr, 0); + __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r5 = __lsx_vld(r5, 0); + __m128i _r5n = __lsx_vld(r5 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k50, (__m128)__lsx_vreplvei_w(_r5, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k51, (__m128)__lsx_vreplvei_w(_r5, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k52, (__m128)__lsx_vreplvei_w(_r5, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k53, (__m128)__lsx_vreplvei_w(_r5, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k54, (__m128)__lsx_vreplvei_w(_r5n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k55, (__m128)__lsx_vreplvei_w(_r5n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k56, (__m128)__lsx_vreplvei_w(_r5n, 2), _sum0); + + __m128 _k60 = (__m128)__lsx_vld(kptr, 0); + __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr -= 4 * 42; + + __m128i _r6 = __lsx_vld(r6, 0); + __m128i _r6n = __lsx_vld(r6 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k60, (__m128)__lsx_vreplvei_w(_r6, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k61, (__m128)__lsx_vreplvei_w(_r6, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k62, (__m128)__lsx_vreplvei_w(_r6, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k63, (__m128)__lsx_vreplvei_w(_r6, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k64, (__m128)__lsx_vreplvei_w(_r6n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k65, (__m128)__lsx_vreplvei_w(_r6n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k66, (__m128)__lsx_vreplvei_w(_r6n, 2), _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + r4 += 2; + r5 += 2; + r6 += 2; + } + + r0 
+= tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + r5 += tailstep; + r6 += tailstep; + } + } + } +} diff --git a/src/layer/loongarch/convolution_int8.h b/src/layer/loongarch/convolution_int8.h new file mode 100644 index 000000000000..22c7a8ccbe6b --- /dev/null +++ b/src/layer/loongarch/convolution_int8.h @@ -0,0 +1,82 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + // const signed char* kptr = weight_data_int8.channel(p); + const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + kptr += maxk; + } + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp new file mode 100644 index 000000000000..31719b3de92b --- /dev/null +++ b/src/layer/loongarch/convolution_loongarch.cpp @@ -0,0 +1,975 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
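// Illustrative sketch (not part of the upstream patch): with 128-bit LSX registers holding four
// fp32 lanes, the elempack choices made later in this file ("num_input % 4 == 0 ? 4 : 1") reduce
// to a rule like the following; the helper name is illustrative only.
static inline int choose_fp32_elempack(int channels, bool use_packing_layout)
{
    // pack channels in groups of 4 when divisible, otherwise stay at pack1
    return (use_packing_layout && channels % 4 == 0) ? 4 : 1;
}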
+ +#include "convolution_loongarch.h" + +#include "benchmark.h" +#include "cpu.h" +#include "layer_type.h" + +#if __loongarch_sx +#include <lsxintrin.h> +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +#include "cpu.h" + +namespace ncnn { + +#include "convolution_sgemm.h" +#include "convolution_winograd_transform.h" +#include "convolution_winograd_dot.h" +#include "convolution_1x1.h" +#include "convolution_3x3.h" + +#if NCNN_INT8 +#include "convolution_sgemm_int8.h" +#include "convolution_winograd_transform_int8.h" +#include "convolution_winograd_dot_int8.h" +#include "convolution_1x1_int8.h" +#include "convolution_3x3_int8.h" +#include "convolution_int8.h" +#endif // NCNN_INT8 + +#if __loongarch_sx +#include "convolution_pack4.h" +#include "convolution_pack1to4.h" +#include "convolution_pack4to1.h" + +#include "convolution_sgemm_pack4.h" +#include "convolution_sgemm_pack4to1.h" +#include "convolution_winograd_transform_pack4.h" +#include "convolution_winograd_dot_pack4.h" +#include "convolution_1x1_pack4.h" +#include "convolution_1x1_pack4to1.h" +#include "convolution_3x3_pack4.h" +#include "convolution_3x3_pack1to4.h" +#include "convolution_7x7_pack1to4.h" + +#if NCNN_INT8 +#include "convolution_pack8to4_int8.h" +#include "convolution_pack1to4_int8.h" +#include "convolution_pack8to1_int8.h" +#include "convolution_sgemm_pack8to4_int8.h" +#include "convolution_sgemm_pack1to4_int8.h" +#include "convolution_sgemm_pack8to1_int8.h" +#include "convolution_winograd_transform_pack4_int8.h" +#include "convolution_winograd_transform_pack8_int8.h" +#include "convolution_winograd_dot_pack8to4_int8.h" +#include "convolution_winograd_dot_pack8to1_int8.h" +#include "convolution_1x1_pack8to4_int8.h" +#include "convolution_1x1_pack1to4_int8.h" +#include "convolution_1x1_pack8to1_int8.h" +#include "convolution_3x3_pack8to4_int8.h" +#include "convolution_3x3_pack8to1_int8.h" +#endif // NCNN_INT8 +#endif // __loongarch_sx + +Convolution_loongarch::Convolution_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + activation = 0; +} + +static void convolution_transform_kernel_packed_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } +} + +int Convolution_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + activation = create_activation_layer(activation_type, activation_params, opt); + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + int elempack 
= 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + +#if __loongarch_sx + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_transform_kernel_pack4_lsx(weight_data, weight_winograd63_data, num_input, num_output, opt); + else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_transform_kernel_pack4_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_transform_kernel_pack4_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1ton + if (elempack == 1 && out_elempack == 4) + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __loongarch_sx + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution) + { + conv3x3s1_winograd43_transform_kernel_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_winograd23_convolution) + { + conv3x3s1_winograd23_transform_kernel_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + } + else if (opt.use_sgemm_convolution) + { + 
convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_loongarch::destroy_pipeline(const Option& opt) +{ + if (activation) + { + activation->destroy_pipeline(opt); + delete activation; + activation = 0; + } + + return 0; +} + +int Convolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + + // flattened blob, implement as InnerProduct + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + Mat bottom_blob_3d; + if (bottom_blob.elemsize % 16 == 0) + { + bottom_blob_3d = bottom_blob; + bottom_blob_3d.dims = 3; + bottom_blob_3d.w = 1; + bottom_blob_3d.h = 1; + bottom_blob_3d.c = bottom_blob.w; + bottom_blob_3d.cstep = 1; + } + else + { + bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator); + } + + Mat top_blob_3d; + int ret = forward(bottom_blob_3d, top_blob_3d, opt); + if (ret != 0) + return ret; + + if (top_blob_3d.elemsize % 16 == 0) + { + top_blob = top_blob_3d; + top_blob.dims = 1; + top_blob.w = top_blob_3d.c; + top_blob.h = 1; + top_blob.c = 1; + top_blob.cstep = top_blob_3d.c; + } + else + { + top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator); + } + + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ?
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt); + else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv3x3s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv7x7s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, 
opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution) + { + conv3x3s1_winograd43_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt); + } + else if (opt.use_winograd23_convolution) + { + conv3x3s1_winograd23_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt); + } + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data_tm + maxk * channels * p; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + float val = sptr[space_ofs[k]]; + float wt = kptr[k]; + sum += val * wt; + } + + kptr += maxk; + } + + sum = activation_ss(sum, 
activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + return 0; +} + +int Convolution_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _kernel_h = _weight_data.h; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(21, dilation_h); + pd.set(3, stride_w); + pd.set(31, stride_h); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_INT8 +static void convolution_transform_kernel_packed_int8_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pa-pb-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < out_elempack; i++) + { + for (int j = 0; j < elempack; j++) + { + const signed char* k00 = weight_data_r2.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } +} + +int Convolution_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 8 == 0 ? 8 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + +#if __loongarch_sx + if (elempack == 8 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 8 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif 
// __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int channels = bottom_blob_bordered.c; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + + int out_elempack_int32 = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack_int32 = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 8 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (elempack == 1 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (elempack == 8 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to1_int8_lsx(bottom_blob_bordered, 
top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/convolution_loongarch.h b/src/layer/loongarch/convolution_loongarch.h new file mode 100644 index 000000000000..a84281bf7135 --- /dev/null +++ b/src/layer/loongarch/convolution_loongarch.h @@ -0,0 +1,56 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
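// For reference, the scale_in_data member declared in the class below is filled per output
// channel in create_pipeline_int8_loongarch(); a hedged sketch of that derivation (helper name
// is illustrative, not part of this header):
static inline float dequant_scale_sketch(float bottom_blob_int8_scale, float weight_data_int8_scale)
{
    // int32 accumulator ~ (x * bottom_scale) * (w * weight_scale), so multiplying by
    // 1 / (bottom_scale * weight_scale) recovers the float sum; a zero weight scale maps to zero
    if (weight_data_int8_scale == 0.f)
        return 0.f;
    return 1.f / (bottom_blob_int8_scale * weight_data_int8_scale);
}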
+ +#ifndef LAYER_CONVOLUTION_LOONGARCH_H +#define LAYER_CONVOLUTION_LOONGARCH_H + +#include "convolution.h" + +namespace ncnn { + +class Convolution_loongarch : virtual public Convolution +{ +public: + Convolution_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; + +protected: +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* activation; + + Mat weight_data_tm; + Mat weight_sgemm_data; + Mat weight_winograd23_data; + Mat weight_winograd43_data; + Mat weight_winograd63_data; + +#if NCNN_INT8 + Mat scale_in_data; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION_LOONGARCH_H diff --git a/src/layer/loongarch/convolution_pack1to4.h b/src/layer/loongarch/convolution_pack1to4.h new file mode 100644 index 000000000000..b7e0123d5edd --- /dev/null +++ b/src/layer/loongarch/convolution_pack1to4.h @@ -0,0 +1,90 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
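// A hedged scalar sketch of the inner loop of convolution_pack1to4_lsx below: each kernel tap
// broadcasts one input value against four packed output channels, which is what a single
// __lsx_vfmadd_s performs per tap (helper name is illustrative):
static inline void pack1to4_tap_sketch(float val, const float wt[4], float sum[4])
{
    for (int o = 0; o < 4; o++)
        sum[o] += val * wt[o];
}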
+ +static void convolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) // 29.23 + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[space_ofs[k]]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + kptr += 4; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack1to4_int8.h b/src/layer/loongarch/convolution_pack1to4_int8.h new file mode 100644 index 000000000000..b043503c2ac6 --- /dev/null +++ b/src/layer/loongarch/convolution_pack1to4_int8.h @@ -0,0 +1,87 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
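// The int8 path below widens signed bytes with __lsx_vilvl_b(__lsx_vslti_b(x, 0), x):
// vslti_b yields 0xFF for negative lanes and 0x00 otherwise, and interleaving that mask above
// each byte is an int8 -> int16 sign extension. Per-lane sketch (illustrative helper only):
static inline short sign_extend_i8_sketch(signed char x)
{
    return (short)x; // what the mask-interleave achieves for all lanes at once
}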
+ +static void convolution_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vreplgr2vr_h((short)sptr[space_ofs[k]]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum = __lsx_vadd_w(_sum, _s032); + + kptr += 4; + } + } + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack4.h b/src/layer/loongarch/convolution_pack4.h new file mode 100644 index 000000000000..66a7863f015b --- /dev/null +++ b/src/layer/loongarch/convolution_pack4.h @@ -0,0 +1,102 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
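// Hedged scalar sketch of the per-tap math in convolution_pack4_lsx below: each tap stores a
// 4x4 weight block, and the four broadcast input lanes multiply-accumulate into the four
// packed output lanes (helper name is illustrative):
static inline void pack4_tap_sketch(const float val[4], const float wt[16], float sum[4])
{
    for (int i = 0; i < 4; i++)     // input lane, broadcast as _val0.._val3
        for (int o = 0; o < 4; o++) // output lane within the vector
            sum[o] += val[i] * wt[i * 4 + o];
}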
+ +static void convolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack4.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) // 29.23 + { + const float* slptr = sptr + space_ofs[k] * 4; + + __m128 _val0 = __lsx_vreplfr2vr_s(slptr[0]); + __m128 _val1 = __lsx_vreplfr2vr_s(slptr[1]); + __m128 _val2 = __lsx_vreplfr2vr_s(slptr[2]); + __m128 _val3 = __lsx_vreplfr2vr_s(slptr[3]); + + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + + kptr += 16; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack4to1.h b/src/layer/loongarch/convolution_pack4to1.h new file mode 100644 index 000000000000..872759fc7f12 --- /dev/null +++ b/src/layer/loongarch/convolution_pack4to1.h @@ -0,0 +1,94 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
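// convolution_pack4to1_lsx below keeps a 4-lane accumulator per output pixel and only folds it
// to a single float at the end via __lsx_reduce_fadd_s; a hedged scalar equivalent of that
// reduction (illustrative helper):
static inline float horizontal_sum4_sketch(const float v[4])
{
    return v[0] + v[1] + v[2] + v[3];
}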
+ +static void convolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_data_ptr) + { + sum = bias_data_ptr[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = (const float*)weight_data_pack4to1.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + kptr += 4; + } + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_pack8to1_int8.h b/src/layer/loongarch/convolution_pack8to1_int8.h new file mode 100644 index 000000000000..c7463a472b6f --- /dev/null +++ b/src/layer/loongarch/convolution_pack8to1_int8.h @@ -0,0 +1,87 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
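// convolution_pack8to1_int8_lsx below uses __lsx_vhaddw_w_h(s, s), which adds adjacent int16
// lanes into int32 lanes, so the eight per-tap products collapse into four partial sums that
// are accumulated and reduced once per pixel. Hedged per-pair sketch (illustrative helper):
static inline int widening_pair_add_sketch(short a, short b)
{
    return (int)a + (int)b;
}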
+ +static void convolution_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + + _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0)); + + kptr += 8; + } + } + + outptr[j] = __lsx_reduce_add_w(_sum); + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_pack8to4_int8.h b/src/layer/loongarch/convolution_pack8to4_int8.h new file mode 100644 index 000000000000..00d90387bbed --- /dev/null +++ b/src/layer/loongarch/convolution_pack8to4_int8.h @@ -0,0 +1,120 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
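// The vilvl_w/vilvh_w/vilvl_d/vilvh_d block in convolution_pack8to4_int8_lsx below is a 4x4
// int32 transpose of the accumulators, so the vertical adds that follow yield one total per
// output channel instead of four horizontal reductions. Hedged scalar equivalent (illustrative):
static inline void transpose4x4_i32_sketch(int m[4][4])
{
    for (int i = 0; i < 4; i++)
        for (int j = i + 1; j < 4; j++)
        {
            int t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
        }
}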
+ +static void convolution_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + kptr += 32; + } + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = __lsx_vadd_w(_sum0, _sum2); + + __lsx_vst(_sum0, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_sgemm.h b/src/layer/loongarch/convolution_sgemm.h new file mode 100644 index 000000000000..7b74ceac14b2 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm.h @@ -0,0 +1,650 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + // permute + Mat tmp; + if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u, 1, opt.workspace_allocator); + { + int nn_size = size / 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = ii * 4; + + float* tmpptr = tmp.channel(i / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { +#if __loongarch_sx + __lsx_vst(__lsx_vld(img0, 0), tmpptr, 0); +#else + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + tmpptr[2] = img0[2]; + tmpptr[3] = img0[3]; +#endif + img0 += size; + tmpptr += 4; + } + } + } + + int remain_size_start = nn_size * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 4 + i % 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + img0 += size; + tmpptr += 1; + } + } + } + } + +#if __loongarch_sx + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + float* outptr4 = top_blob.channel(p + 4); + float* outptr5 = top_blob.channel(p + 5); + float* outptr6 = top_blob.channel(p + 6); + float* outptr7 = top_blob.channel(p + 7); + + const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]); + __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]); + __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]); + __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]); + __m128 _sum4 = __lsx_vreplfr2vr_s(biasptr[4]); + __m128 _sum5 = __lsx_vreplfr2vr_s(biasptr[5]); + __m128 _sum6 = __lsx_vreplfr2vr_s(biasptr[6]); + __m128 _sum7 = __lsx_vreplfr2vr_s(biasptr[7]); + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 32); + __m128 _val = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr, 0); + __m128i _w4567 = __lsx_vld(kptr + 4, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7); + + tmpptr += 4; + kptr += 8; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + __lsx_vst(_sum4, outptr4, 0); + __lsx_vst(_sum5, outptr5, 0); + __lsx_vst(_sum6, outptr6, 0); + __lsx_vst(_sum7, outptr7, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + outptr4 += 4; + outptr5 += 4; + outptr6 += 4; + outptr7 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + float sum2 = biasptr[2]; + float sum3 = biasptr[3]; + float sum4 = biasptr[4]; + float sum5 = biasptr[5]; + float sum6 = biasptr[6]; + float sum7 = biasptr[7]; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + sum4 += tmpptr[0] * kptr[4]; + sum5 += tmpptr[0] * kptr[5]; + sum6 += tmpptr[0] * kptr[6]; + sum7 += tmpptr[0] * kptr[7]; + tmpptr++; + kptr += 8; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + outptr4[0] = sum4; + outptr5[0] = sum5; + outptr6[0] = sum6; + outptr7[0] = sum7; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + outptr4++; + outptr5++; + outptr6++; + outptr7++; + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + + const float zeros[4] = {0.f, 0.f, 0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]); + __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]); + __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]); + __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]); + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + + tmpptr += 4; + kptr += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + float sum2 = biasptr[2]; + float sum3 = biasptr[3]; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + tmpptr++; + kptr += 4; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + } + + remain_outch_start += nn_outch << 2; +#else // __loongarch_sx + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + + const float zeros[2] = {0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + float sum00 = biasptr[0]; + float sum01 = biasptr[0]; + float sum02 = biasptr[0]; + float sum03 = biasptr[0]; + float sum10 = biasptr[1]; + float sum11 = biasptr[1]; + float sum12 = biasptr[1]; + float sum13 = biasptr[1]; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 8); + float k0 = kptr[0]; + float k1 = kptr[1]; + sum00 += tmpptr[0] * k0; + sum01 += tmpptr[1] * k0; + sum02 += tmpptr[2] * k0; + sum03 += tmpptr[3] * k0; + sum10 += tmpptr[0] * k1; + sum11 += tmpptr[1] * k1; + sum12 += tmpptr[2] * k1; + sum13 += tmpptr[3] * k1; + tmpptr += 4; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr0[2] = sum02; + outptr0[3] = sum03; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr1[2] = sum12; + outptr1[3] = sum13; + + outptr0 += 4; + outptr1 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr + 8); + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + tmpptr++; + kptr += 2; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + + outptr0++; + outptr1++; + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + const float bias0 = bias ? bias[p] : 0.f; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); +#if __loongarch_sx + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const float* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + +#if __loongarch_sx + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + + for (int q = 0; q < nn; q++) + { + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(tmpptr, 0), __lsx_vreplfr2vr_s(kptr[0]), _sum0); + tmpptr += 4; + kptr++; + } + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; +#else + float sum0 = bias0; + float sum1 = bias0; + float sum2 = bias0; + float sum3 = bias0; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 4); + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[1] * kptr[0]; + sum2 += tmpptr[2] * kptr[0]; + sum3 += tmpptr[3] * kptr[0]; + tmpptr += 4; + kptr++; + } + + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0[2] = sum2; + outptr0[3] = sum3; + + outptr0 += 4; +#endif // __loongarch_sx + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); +#if __loongarch_sx + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const float* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + + float sum0 = bias0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + tmpptr++; + kptr++; + } + + outptr0[0] = sum0; + + outptr0++; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8b-maxk-inch-outch/8b + Mat kernel = 
_kernel.reshape(maxk, inch, outch); +#if __loongarch_sx + kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + const Mat k4 = kernel.channel(q + 4); + const Mat k5 = kernel.channel(q + 5); + const Mat k6 = kernel.channel(q + 6); + const Mat k7 = kernel.channel(q + 7); + + float* g00 = kernel_tm.channel(q / 8); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + const float* k20 = k2.row(p); + const float* k30 = k3.row(p); + const float* k40 = k4.row(p); + const float* k50 = k5.row(p); + const float* k60 = k6.row(p); + const float* k70 = k7.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + g00[4] = k40[k]; + g00[5] = k50[k]; + g00[6] = k60[k]; + g00[7] = k70[k]; + + g00 += 8; + } + } + } + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + + float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + const float* k20 = k2.row(p); + const float* k30 = k3.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + + g00 += 4; + } + } + } +#else + for (; q + 1 < outch; q += 2) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + + float* g00 = kernel_tm.channel(q / 2); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + + g00 += 2; + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + +#if __loongarch_sx + float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + float* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + + g00 += 1; + } + } + } +} + +static void convolution_im2col_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + 
im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_int8.h b/src/layer/loongarch/convolution_sgemm_int8.h new file mode 100644 index 000000000000..98f47760901f --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_int8.h @@ -0,0 +1,800 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; +#if __loongarch_sx + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else +#endif // __loongarch_sx + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed 
char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } + +#if __loongarch_sx + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + + __m128i _exts00 = __lsx_vslti_h(_s00, 0); + __m128i _exts01 = __lsx_vslti_h(_s01, 0); + __m128i _exts10 = __lsx_vslti_h(_s10, 0); + __m128i _exts11 = __lsx_vslti_h(_s11, 0); + __m128i _s00l = __lsx_vilvl_h(_exts00, _s00); + __m128i _s00h = __lsx_vilvh_h(_exts00, _s00); + __m128i _s01l = __lsx_vilvl_h(_exts01, _s01); + __m128i _s01h = __lsx_vilvh_h(_exts01, _s01); + __m128i _s10l = __lsx_vilvl_h(_exts10, _s10); + __m128i _s10h = __lsx_vilvh_h(_exts10, _s10); + __m128i _s11l = __lsx_vilvl_h(_exts11, _s11); + __m128i _s11h = __lsx_vilvh_h(_exts11, _s11); + + _sum00 = __lsx_vadd_w(_sum00, _s00l); + _sum01 = __lsx_vadd_w(_sum01, _s00h); + _sum02 = __lsx_vadd_w(_sum02, _s01l); + _sum03 = __lsx_vadd_w(_sum03, _s01h); + _sum10 = __lsx_vadd_w(_sum10, _s10l); + _sum11 = __lsx_vadd_w(_sum11, _s10h); + _sum12 = __lsx_vadd_w(_sum12, _s11l); + _sum13 = __lsx_vadd_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = 
__lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum10 = __lsx_vadd_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + _sum2 = __lsx_vadd_w(_sum2, _s1l); + _sum3 = __lsx_vadd_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + int j = 0; + for (; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + 
_sum0 = __lsx_vadd_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } +#else // __loongarch_sx + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 2); + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + int nn1 = inch * maxk; + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char val1 = tmpptr[1]; + signed char w0 = kptr[0]; + signed char w1 = kptr[1]; + + sum00 += val0 * w0; + sum01 += val1 * w0; + sum10 += val0 * w1; + sum11 += val1 * w1; + + tmpptr += 2; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr0 += 2; + outptr1 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 2); + + int sum00 = 0; + int sum10 = 0; + + int nn1 = inch * maxk; + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char w0 = kptr[0]; + signed char w1 = kptr[1]; + + sum00 += val0 * w0; + sum10 += val0 * w1; + + tmpptr += 1; + kptr += 2; + } + + outptr0[0] = sum00; + outptr1[0] = sum10; + outptr0 += 1; + outptr1 += 1; + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); +#if __loongarch_sx + const signed char* kptr = kernel.channel(p / 4 + p % 4); +#else + const signed char* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int sum0 = 0; + int sum1 = 0; + +#if __loongarch_sx + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + if (nn4 > 0) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + tmpptr += 8; + kptr += 4; + } + + sum0 = __lsx_reduce_add_w(_sum0); + sum1 = __lsx_reduce_add_w(_sum1); + } +#else + int nn1 = inch * maxk; +#endif // __loongarch_sx + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char val1 = tmpptr[1]; + signed char w = kptr[0]; + + sum0 += val0 * w; + sum1 += val1 * w; + + tmpptr += 2; + kptr += 1; + } + + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); +#if __loongarch_sx + const signed char* kptr = kernel.channel(p / 4 + p % 4); +#else + const signed char* 
kptr = kernel.channel(p / 2 + p % 2); +#endif + + int sum = 0; + +#if __loongarch_sx + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + if (nn4 > 0) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum = __lsx_vadd_w(_sum, _s032); + + tmpptr += 4; + kptr += 4; + } + + sum = __lsx_reduce_add_w(_sum); + } +#else + int nn1 = inch * maxk; +#endif // __loongarch_sx + + int j = 0; + for (; j < nn1; j++) + { + signed char val = tmpptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + tmpptr += 1; + kptr += 1; + } + + outptr0[0] = sum; + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); +#if __loongarch_sx + if (outch >= 4) + { + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4 + outch % 4, (size_t)1u); + } +#else + if (outch >= 2) + { + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)1u); + } +#endif // __loongarch_sx + else + { +#if __loongarch_sx + if (inch >= 4) + kernel_tm.create(4 * maxk, inch / 4 + inch % 4, outch, (size_t)1u); + else +#endif // __loongarch_sx + { + kernel_tm.create(1 * maxk, inch, outch, (size_t)1u); + } + } + + int q = 0; +#if __loongarch_sx + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + signed char* g00 = kernel_tm.channel(q / 2); + + int p = 0; + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 2; i++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); +#else + signed char* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + int p = 0; +#if __loongarch_sx + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q).row<const signed char>(p + j); + g00[0] = k00[k]; + g00++; + } + } + } +#endif // __loongarch_sx + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k00 = kernel.channel(q).row<const signed char>(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void convolution_im2col_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, 
const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h b/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h new file mode 100644 index 000000000000..3429bfae5fa6 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h @@ -0,0 +1,481 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License.
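The pack1to4 int8 kernels below read a plain (unpacked) int8 im2col buffer and write packed-4 int32 sums: groups of four input channels are widened from int8 to int16 with __lsx_vilvl_b plus a sign mask, multiplied against the interleaved weights, widened again to int32 and accumulated. A scalar reference of what one output pixel's accumulation computes (an illustrative sketch only; the flat [oc][k] weight layout here is a simplification, not the interleaved layout used by the patch):

    // Scalar reference: exact 32-bit accumulation of int8 products, matching
    // the vilvl_b/vslti_b widening and vmul_h/vadd_w steps in the code below.
    static void int8_conv_reference(const signed char* vals,    // inch * maxk samples for one output pixel
                                    const signed char* weights, // [oc][inch * maxk], oc = 0..3
                                    int* out,                   // 4 int32 accumulators, one per output channel
                                    int n)                      // n = inch * maxk
    {
        for (int oc = 0; oc < 4; oc++)
        {
            int sum = 0;
            for (int k = 0; k < n; k++)
            {
                sum += (int)vals[k] * (int)weights[oc * n + k]; // sign-extend, multiply, accumulate
            }
            out[oc] = sum;
        }
    }

The vectorized loops compute exactly these sums, four output channels and up to two output pixels at a time.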
+ +static void im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = 
__lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + + __m128i _exts00 = __lsx_vslti_h(_s00, 0); + __m128i _exts01 = __lsx_vslti_h(_s01, 0); + __m128i _exts10 = __lsx_vslti_h(_s10, 0); + __m128i _exts11 = __lsx_vslti_h(_s11, 0); + __m128i _s00l = __lsx_vilvl_h(_exts00, _s00); + __m128i _s00h = __lsx_vilvh_h(_exts00, _s00); + __m128i _s01l = __lsx_vilvl_h(_exts01, _s01); + __m128i _s01h = __lsx_vilvh_h(_exts01, _s01); + __m128i _s10l = __lsx_vilvl_h(_exts10, _s10); + __m128i _s10h = __lsx_vilvh_h(_exts10, _s10); + __m128i _s11l = __lsx_vilvl_h(_exts11, _s11); + __m128i _s11h = __lsx_vilvh_h(_exts11, _s11); + + _sum00 = __lsx_vadd_w(_sum00, _s00l); + _sum01 = __lsx_vadd_w(_sum01, _s00h); + _sum02 = __lsx_vadd_w(_sum02, _s01l); + _sum03 = __lsx_vadd_w(_sum03, _s01h); + _sum10 = __lsx_vadd_w(_sum10, _s10l); + _sum11 = __lsx_vadd_w(_sum11, _s10h); + _sum12 = __lsx_vadd_w(_sum12, _s11l); + _sum13 = __lsx_vadd_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum10 = __lsx_vadd_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* 
kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + _sum2 = __lsx_vadd_w(_sum2, _s1l); + _sum3 = __lsx_vadd_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = 
top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack4.h b/src/layer/loongarch/convolution_sgemm_pack4.h new file mode 100644 index 000000000000..e3e7279a5d2c --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack4.h @@ -0,0 +1,519 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License.
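The float pack4 path below first repacks the im2col columns into tiles of 12, 8, 4, 2 and 1 so the inner product loop can keep all accumulators in registers; each step then broadcasts one input value per pixel with __lsx_vreplvei_w and multiply-accumulates it against a packed-4 weight vector with __lsx_vfmadd_s. A reduced sketch of one such step for a 4-pixel tile (illustrative names; the real code also uses 12-, 8-, 2- and 1-wide tiles):

    // One inner-loop step of the pack4 GEMM: w holds one packed-4 weight lane
    // (4 output channels), val holds the current input value for each pixel in
    // the tile, and sum[pixel][oc] are the running accumulators.
    static void pack4_fma_step(const float w[4], const float val[4], float sum[4][4])
    {
        for (int pixel = 0; pixel < 4; pixel++)
            for (int oc = 0; oc < 4; oc++)
                sum[pixel][oc] += w[oc] * val[pixel]; // __lsx_vfmadd_s(_w0, vreplvei(_val, pixel), _sum)
    }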
+ +static void im2col_sgemm_pack4_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + // permute + Mat tmp; + if (size >= 12) + tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + (size % 12 % 4) / 2 + size % 12 % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size / 12; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 12; + + float* tmpptr = tmp.channel(i / 12); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x12 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(img0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(img0 + 4 * 11, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + 
__lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + img0 += size * 4; + tmpptr += 48; + } + } + } + + remain_size_start += nn_size * 12; + nn_size = (size - remain_size_start) >> 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + img0 += size * 4; + tmpptr += 32; + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + img0 += size * 4; + tmpptr += 16; + } + } + } + + remain_size_start += nn_size << 2; + nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + + for (int q = 0; q < 
inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x2 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + + __m128i _r01_0 = __lsx_vilvl_w(_r1, _r0); + __m128i _r01_1 = __lsx_vilvh_w(_r1, _r0); + + __lsx_vst(_r01_0, tmpptr, 0); + __lsx_vst(_r01_1, tmpptr + 4, 0); + + img0 += size * 4; + tmpptr += 8; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(img0, 0); + __lsx_vst(_val, tmpptr, 0); + + img0 += size * 4; + tmpptr += 4; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + __m128 _sum4 = _sum0; + __m128 _sum5 = _sum0; + __m128 _sum6 = _sum0; + __m128 _sum7 = _sum0; + __m128 _sum8 = _sum0; + __m128 _sum9 = _sum0; + __m128 _suma = _sum0; + __m128 _sumb = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128i _val4567 = __lsx_vld(tmpptr + 4, 0); + __m128i _val89ab = __lsx_vld(tmpptr + 8, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + _sum8 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 0), _sum8); + _sum9 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 1), _sum9); + _suma = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 2), _suma); + _sumb = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 3), _sumb); + + tmpptr += 12; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + __lsx_vst(_sum8, outptr0 + 4 * 8, 0); + __lsx_vst(_sum9, outptr0 + 4 * 9, 0); + __lsx_vst(_suma, outptr0 + 4 * 10, 0); + __lsx_vst(_sumb, outptr0 + 4 * 11, 0); + + outptr0 += 4 * 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch 
always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + __m128 _sum4 = _sum0; + __m128 _sum5 = _sum0; + __m128 _sum6 = _sum0; + __m128 _sum7 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128i _val4567 = __lsx_vld(tmpptr + 4, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + + tmpptr += 8; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + + tmpptr += 4; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + } + for (; i + 1 < size; i += 2) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 8); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _val1 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + _sum1 = __lsx_vfmadd_s(_w0, _val1, _sum1); + + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + kptr0 += 4; + } + + __lsx_vst(_sum, outptr0, 0); + + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + { + const int gap = (w * stride_h - outw * stride_w) * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __lsx_vst(_val, ptr, 0); + + sptr += stride_w * 4; + ptr += 4; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack4to1.h b/src/layer/loongarch/convolution_sgemm_pack4to1.h new file mode 100644 index 000000000000..3748645b4d4c --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack4to1.h @@ -0,0 +1,667 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
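The pack4-to-pack1 routines below follow the same im2col + SGEMM scheme as the pack4 code above: the convolution becomes a matrix product between an outch x (inch*maxk) kernel matrix and an (inch*maxk) x (outw*outh) im2col matrix. A minimal scalar sketch of that equivalence, for orientation only (plain std::vector in place of ncnn's Mat; stride 1, no dilation; the in_w/in_h/kw/kh names are assumptions, not part of this patch):

#include <vector>

// naive im2col + GEMM reference: out[o][pixel] = sum_{c,k} kernel[o][c*maxk+k] * cols[c*maxk+k][pixel]
static void conv_im2col_gemm_ref(const std::vector<float>& in, int in_w, int in_h, int inch,
                                 const std::vector<float>& kernel, int kw, int kh, int outch,
                                 std::vector<float>& out)
{
    const int outw = in_w - kw + 1;
    const int outh = in_h - kh + 1;
    const int maxk = kw * kh;

    // im2col: one row per (input channel, kernel offset), one column per output pixel
    std::vector<float> cols((size_t)inch * maxk * outw * outh);
    for (int c = 0; c < inch; c++)
        for (int u = 0; u < kh; u++)
            for (int v = 0; v < kw; v++)
                for (int y = 0; y < outh; y++)
                    for (int x = 0; x < outw; x++)
                        cols[(((size_t)c * maxk + u * kw + v) * outh + y) * outw + x] = in[((size_t)c * in_h + y + u) * in_w + x + v];

    // GEMM over the flattened (channel, kernel offset) dimension
    out.assign((size_t)outch * outw * outh, 0.f);
    for (int o = 0; o < outch; o++)
        for (int ck = 0; ck < inch * maxk; ck++)
            for (int p = 0; p < outw * outh; p++)
                out[(size_t)o * outw * outh + p] += kernel[(size_t)o * inch * maxk + ck] * cols[(size_t)ck * outw * outh + p];
}

The pack4to1 code performs the same computation, but first repacks the im2col columns into tiles of 12/8/4/1 so that a dozen output columns at a time can be accumulated in vector registers.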
+ +static void im2col_sgemm_pack4to1_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + Mat tmp; + if (size >= 12) + tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + size % 12 % 4, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size / 12; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 12; + + float* tmpptr = tmp.channel(i / 12); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x12 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(img0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(img0 + 4 * 11, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + img0 += size * 4; + tmpptr += 48; + } + } + } + + remain_size_start += nn_size * 12; + nn_size = (size - remain_size_start) >> 3; + + 
#pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + img0 += size * 4; + tmpptr += 32; + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + img0 += size * 4; + tmpptr += 16; + } + } + } + + remain_size_start += nn_size << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(img0, 0); + __lsx_vst(_val, tmpptr, 0); + + img0 += size * 4; + tmpptr += 4; + } + } + } + } + + int 
nn_outch = outch / 4; + int remain_outch_start = nn_outch * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + + const float zeros[4] = {0.f}; + const float* biasptr = bias ? bias + p : zeros; + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum8 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum9 = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _suma = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _sumb = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val2, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val2, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum7); + _sum8 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val2, _sum8); + _sum9 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum9); + _suma = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _suma); + _sumb = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val2, _sumb); + + tmpptr += 12; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 8, 0); + __lsx_vst(_sum3, outptr1, 0); + __lsx_vst(_sum4, outptr1 + 4, 0); + __lsx_vst(_sum5, outptr1 + 8, 0); + __lsx_vst(_sum6, outptr2, 0); + __lsx_vst(_sum7, outptr2 + 4, 0); + __lsx_vst(_sum8, outptr2 + 8, 0); + __lsx_vst(_sum9, outptr3, 0); + __lsx_vst(_suma, outptr3 + 4, 0); + __lsx_vst(_sumb, outptr3 + 8, 0); + + outptr0 += 12; + outptr1 += 12; + outptr2 += 12; + outptr3 += 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 
2); + __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _sum7); + + tmpptr += 8; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr1, 0); + __lsx_vst(_sum3, outptr1 + 4, 0); + __lsx_vst(_sum4, outptr2, 0); + __lsx_vst(_sum5, outptr2 + 4, 0); + __lsx_vst(_sum6, outptr3, 0); + __lsx_vst(_sum7, outptr3 + 4, 0); + + outptr0 += 8; + outptr1 += 8; + outptr2 += 8; + outptr3 += 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum3); + + tmpptr += 4; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum = (__m128)__lsx_vld(biasptr, 0); + float* _sum_p = (float*)&_sum; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + kptr0 += 4; + } + + outptr0[0] = _sum_p[0]; + outptr1[0] = _sum_p[1]; + outptr2[0] = _sum_p[2]; + outptr3[0] = _sum_p[3]; + + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + const float bias0 = bias ? 
bias[p] : 0.f; + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum1 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum2 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1); + _sum2 = __lsx_vfmadd_s(_val2, _w0, _sum2); + + tmpptr += 12; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 8, 0); + + outptr0 += 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum1 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1); + + tmpptr += 8; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + + tmpptr += 4; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = bias0; + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + tmpptr += 4; + kptr0 += 4; + } + + sum0 += __lsx_reduce_fadd_s(_sum0); + + outptr0[0] = sum0; + + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = pb-pa-maxk-inch/pa-outch/pb + Mat kernel = _kernel.reshape(maxk, inch, outch); + kernel_tm.create(4 * 4 * maxk, inch / 4, outch / 4 + outch % 4); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + float* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; 
i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + + float* g00 = kernel_tm.channel(q / 4 + q % 4); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = k0.row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + { + const int gap = (w * stride_h - outw * stride_w) * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __lsx_vst(_val, ptr, 0); + + sptr += stride_w * 4; + ptr += 4; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h b/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h new file mode 100644 index 000000000000..98d11a574b0e --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h @@ -0,0 +1,458 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
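The int8 kernels below repeatedly widen packed int8 data to int16 before multiplying: __lsx_vslti_b(x, 0) yields 0xFF for negative bytes and 0x00 otherwise, and interleaving that mask with the original bytes is exactly a sign extension; the int16 products are then folded into int32 lanes with the pairwise horizontal add __lsx_vhaddw_w_h. A scalar model of the widening step (assuming little-endian lane order, as in the vector code):

#include <cstdint>

// widen one int8 the way vilvl_b(vslti_b(v, 0), v) does per lane:
// low byte = the value, high byte = 0xFF if negative, 0x00 otherwise
static int16_t widen_i8_to_i16(int8_t v)
{
    uint8_t lo = (uint8_t)v;
    uint8_t hi = (v < 0) ? 0xFF : 0x00;
    return (int16_t)((uint16_t)lo | ((uint16_t)hi << 8));
}

// the horizontal add then turns two adjacent int16 products into one int32 partial sum
static int32_t haddw_pair(int16_t a, int16_t b)
{
    return (int32_t)a + (int32_t)b;
}

For example, widen_i8_to_i16(-3) returns 0xFFFD, i.e. -3 as an int16, so the subsequent 16-bit multiplies see the correct signed values.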
+ +static void im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + __m128i _v = __lsx_vld(img0, 0); + __lsx_vst(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 128); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s02 = __lsx_vmul_h(_val0, _w2); + __m128i _s03 = __lsx_vmul_h(_val0, _w3); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + __m128i _s12 = __lsx_vmul_h(_val1, _w2); + __m128i _s13 = __lsx_vmul_h(_val1, _w3); + + _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00)); + _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01)); + _sum02 = __lsx_vadd_w(_sum02, 
__lsx_vhaddw_w_h(_s02, _s02)); + _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03)); + _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10)); + _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11)); + _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, _s12)); + _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 128); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = 
__lsx_vadd_w(_sum0, _sum2); + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 32); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val0, _w16); + __m128i _s1 = __lsx_vmul_h(_val1, _w16); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + + tmpptr += 16; + kptr += 8; + } + + outptr0[0] = __lsx_reduce_add_w(_sum0); + outptr0[1] = __lsx_reduce_add_w(_sum1); + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + + _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0)); + + tmpptr += 8; + kptr += 8; + } + + outptr0[0] = __lsx_reduce_add_w(_sum); + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (outch >= 4) + kernel_tm.create(32 * maxk, inch / 8, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(8 * maxk, inch / 8, outch, (size_t)1u); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + // TODO unroll 2 + for (; q < outch; q++) + { + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int 
dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h b/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h new file mode 100644 index 000000000000..ae9090c95606 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h @@ -0,0 +1,324 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
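As in the pack8to1 path, each accumulation in the pack8to4 kernel below ends with an interleave-based 4x4 transpose (vilvl_w/vilvh_w followed by vilvl_d/vilvh_d) so that the four int32 partial sums belonging to one output channel land in a single vector and can be collapsed with plain vertical adds. The net effect of the two interleave stages is nothing more than a 4x4 transpose; a scalar model for reference (plain arrays standing in for the __m128i lanes):

#include <cstdint>

// net effect of the two interleave stages used after the int8 accumulation loops
static void transpose4x4(const int32_t in[4][4], int32_t out[4][4])
{
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            out[j][i] = in[i][j]; // lane j of partial-sum vector i moves to row j
}

After the transpose, adding the four rows (as _sum00 + _sum01 + _sum02 + _sum03 in the code) yields one finished int32 result per output channel.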
+ +static void im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + __m128i _v = __lsx_vld(img0, 0); + __lsx_vst(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 128); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s02 = __lsx_vmul_h(_val0, _w2); + __m128i _s03 = __lsx_vmul_h(_val0, _w3); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + __m128i _s12 = __lsx_vmul_h(_val1, _w2); + __m128i _s13 = __lsx_vmul_h(_val1, _w3); + + _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00)); + _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01)); + _sum02 = __lsx_vadd_w(_sum02, __lsx_vhaddw_w_h(_s02, _s02)); + _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03)); + _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10)); + _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11)); + _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, 
_s12)); + _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 128); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = __lsx_vadd_w(_sum0, _sum2); + + __lsx_vst(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < 
outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_winograd_dot.h b/src/layer/loongarch/convolution_winograd_dot.h new file mode 100644 index 000000000000..9dbbe4955490 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot.h @@ -0,0 +1,495 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
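convolution_winograd_dot_lsx below implements only the multiplication stage of Winograd convolution: the input and kernel have already been transformed, and for each of the 16/36/64 transformed positions (the batch dimension) the tiles are combined by an independent small matrix multiply over the input channels. A naive scalar reference of that stage, for orientation only (flat arrays with an assumed [channel][batch][tile] input and [outch][inch][batch] kernel layout, not the actual Mat layout used by the patch):

#include <vector>

// naive Winograd dot stage: top[o][b][t] = sum_c bottom[c][b][t] * kernel[o][c][b]
static void winograd_dot_ref(const std::vector<float>& bottom, const std::vector<float>& kernel,
                             std::vector<float>& top, int inch, int outch, int batch, int tiles)
{
    top.assign((size_t)outch * batch * tiles, 0.f);
    for (int o = 0; o < outch; o++)
        for (int b = 0; b < batch; b++)
            for (int t = 0; t < tiles; t++)
            {
                float sum = 0.f;
                for (int c = 0; c < inch; c++)
                    sum += bottom[((size_t)c * batch + b) * tiles + t] * kernel[((size_t)o * inch + c) * batch + b];
                top[((size_t)o * batch + b) * tiles + t] = sum;
            }
}

The LSX version computes the same sums, but repacks the tiles in groups of four and unrolls the output channels by eight and four so the inner loop stays in vector registers.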
+ +static void convolution_winograd_dot_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 4) + bottom_blob_tm2.create(4 * inch, tiles / 4 + tiles % 4, batch, 4u, opt.workspace_allocator); + else + bottom_blob_tm2.create(1 * inch, tiles, batch, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 3 < tiles; i += 4) + { + float* tmpptr = tm2.row(i / 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i); + + for (int q = 0; q < inch; q++) + { +#if __loongarch_sx + __lsx_vst(__lsx_vld(r0, 0), tmpptr, 0); +#else + tmpptr[0] = r0[0]; + tmpptr[1] = r0[1]; + tmpptr[2] = r0[2]; + tmpptr[3] = r0[3]; +#endif + + r0 += bottom_blob_tm.cstep; + tmpptr += 4; + } + } + for (; i < tiles; i++) + { + float* tmpptr = tm2.row(i / 4 + i % 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i); + + for (int q = 0; q < inch; q++) + { + tmpptr[0] = r0[0]; + + r0 += bottom_blob_tm.cstep; + tmpptr += 1; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, opt.workspace_allocator); + +#if __loongarch_sx + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + float* output2_tm = top_blob_tm.channel(p + 2); + float* output3_tm = top_blob_tm.channel(p + 3); + float* output4_tm = top_blob_tm.channel(p + 4); + float* output5_tm = top_blob_tm.channel(p + 5); + float* output6_tm = top_blob_tm.channel(p + 6); + float* output7_tm = top_blob_tm.channel(p + 7); + + const Mat kernel0_tm = kernel_tm.channel(p / 8); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 32); + __m128 _val = (__m128)__lsx_vld(r0, 0); + __m128i _w0123 = __lsx_vld(k0, 0); + __m128i _w4567 = __lsx_vld(k0 + 4, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6); + _sum7 
= __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7); + + r0 += 4; + k0 += 8; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output1_tm, 0); + __lsx_vst(_sum2, output2_tm, 0); + __lsx_vst(_sum3, output3_tm, 0); + __lsx_vst(_sum4, output4_tm, 0); + __lsx_vst(_sum5, output5_tm, 0); + __lsx_vst(_sum6, output6_tm, 0); + __lsx_vst(_sum7, output7_tm, 0); + + output0_tm += 4; + output1_tm += 4; + output2_tm += 4; + output3_tm += 4; + output4_tm += 4; + output5_tm += 4; + output6_tm += 4; + output7_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + float sum4 = 0.f; + float sum5 = 0.f; + float sum6 = 0.f; + float sum7 = 0.f; + + int j = 0; + for (; j < nn; j++) + { + sum0 += r0[0] * k0[0]; + sum1 += r0[0] * k0[1]; + sum2 += r0[0] * k0[2]; + sum3 += r0[0] * k0[3]; + sum4 += r0[0] * k0[4]; + sum5 += r0[0] * k0[5]; + sum6 += r0[0] * k0[6]; + sum7 += r0[0] * k0[7]; + + r0 += 1; + k0 += 8; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + output2_tm[0] = sum2; + output3_tm[0] = sum3; + output4_tm[0] = sum4; + output5_tm[0] = sum5; + output6_tm[0] = sum6; + output7_tm[0] = sum7; + + output0_tm++; + output1_tm++; + output2_tm++; + output3_tm++; + output4_tm++; + output5_tm++; + output6_tm++; + output7_tm++; + } + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + float* output2_tm = top_blob_tm.channel(p + 2); + float* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 16); + __m128 _val = (__m128)__lsx_vld(r0, 0); + __m128i _w0123 = __lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + + r0 += 4; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output1_tm, 0); + __lsx_vst(_sum2, output2_tm, 0); + __lsx_vst(_sum3, output3_tm, 0); + + output0_tm += 4; + output1_tm += 4; + output2_tm += 4; + output3_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + int j = 0; + for (; j < nn; j++) + { + sum0 += r0[0] * k0[0]; + sum1 += r0[0] * k0[1]; + sum2 += r0[0] * k0[2]; + sum3 += r0[0] * k0[3]; + + r0 += 1; + k0 += 4; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + 
output2_tm[0] = sum2; + output3_tm[0] = sum3; + + output0_tm++; + output1_tm++; + output2_tm++; + output3_tm++; + } + } + } + + remain_outch_start += nn_outch << 2; +#else + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + + const Mat kernel0_tm = kernel_tm.channel(p / 2); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum00 = 0.f; + float sum01 = 0.f; + float sum02 = 0.f; + float sum03 = 0.f; + float sum10 = 0.f; + float sum11 = 0.f; + float sum12 = 0.f; + float sum13 = 0.f; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 8); + float w0 = k0[0]; + float w1 = k0[1]; + sum00 += r0[0] * w0; + sum01 += r0[1] * w0; + sum02 += r0[2] * w0; + sum03 += r0[3] * w0; + sum10 += r0[0] * w1; + sum11 += r0[1] * w1; + sum12 += r0[2] * w1; + sum13 += r0[3] * w1; + + r0 += 4; + k0 += 2; + } + + output0_tm[0] = sum00; + output0_tm[1] = sum01; + output0_tm[2] = sum02; + output0_tm[3] = sum03; + output1_tm[0] = sum10; + output1_tm[1] = sum11; + output1_tm[2] = sum12; + output1_tm[3] = sum13; + + output0_tm += 4; + output1_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum00 = 0.f; + float sum10 = 0.f; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 4); + __builtin_prefetch(k0 + 8); + float val0 = r0[0]; + sum00 += val0 * k0[0]; + sum10 += val0 * k0[1]; + + r0 += 1; + k0 += 2; + } + + output0_tm[0] = sum00; + output1_tm[0] = sum10; + output0_tm++; + output1_tm++; + } + } + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* output0_tm = top_blob_tm.channel(p); + +#if __loongarch_sx + const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2); +#endif + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + int j = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + + for (; j < nn; j++) + { + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(r0, 0), __lsx_vreplfr2vr_s(k0[0]), _sum0); + r0 += 4; + k0++; + } + + __lsx_vst(_sum0, output0_tm, 0); + output0_tm += 4; +#else // __loongarch_sx + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 4); + float w0 = k0[0]; + sum0 += r0[0] * w0; + sum1 += r0[1] * w0; + sum2 += r0[2] * w0; + sum3 += r0[3] * w0; + + r0 += 4; + k0++; + } + + output0_tm[0] = sum0; + output0_tm[1] = sum1; + output0_tm[2] = sum2; + output0_tm[3] = sum3; + output0_tm += 4; +#endif // __loongarch_sx + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum = 0.f; + + for (int j = 0; j < nn; 
j++) + { + float w0 = k0[0]; + float val0 = r0[0]; + sum += val0 * w0; + + r0 += 1; + k0 += 1; + } + + output0_tm[0] = sum; + output0_tm += 1; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_int8.h b/src/layer/loongarch/convolution_winograd_dot_int8.h new file mode 100644 index 000000000000..2ae5ce4f55eb --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_int8.h @@ -0,0 +1,594 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u, 1, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; +#if __loongarch_sx + if (inch >= 4) + { + if (tiles >= 2) + bottom_blob_tm2.create(inch / 4 + inch % 4, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(inch / 4 + inch % 4, tiles, batch, 8u, 4, opt.workspace_allocator); + } + else +#endif // __loongarch_sx + { + if (tiles >= 2) + bottom_blob_tm2.create(inch, tiles / 2 + tiles % 2, batch, 4u, 2, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(inch, tiles, batch, 2u, 1, opt.workspace_allocator); + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = (const short*)bottom_blob_tm + r * tiles + i; + + int q = 0; +#if __loongarch_sx + const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i; + const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i; + const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i; + for (; q + 3 < inch; q += 4) + { + tmpptr[0] = r0[0]; + tmpptr[1] = r1[0]; + tmpptr[2] = r2[0]; + tmpptr[3] = r3[0]; + tmpptr[4] = r0[1]; + tmpptr[5] = r1[1]; + tmpptr[6] = r2[1]; + tmpptr[7] = r3[1]; + r0 += bottom_blob_tm.cstep * 4; + r1 += bottom_blob_tm.cstep * 4; + r2 += bottom_blob_tm.cstep * 4; + r3 += bottom_blob_tm.cstep * 4; + tmpptr += 8; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + tmpptr[0] = r0[0]; + tmpptr[1] = r0[1]; + r0 += bottom_blob_tm.cstep; + tmpptr += 2; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = (const short*)bottom_blob_tm + r * tiles + i; + + int q = 0; +#if __loongarch_sx + const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i; + const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i; + const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i; + for (; q + 3 < inch; q += 4) + { + 
tmpptr[0] = r0[0]; + tmpptr[1] = r1[0]; + tmpptr[2] = r2[0]; + tmpptr[3] = r3[0]; + r0 += bottom_blob_tm.cstep * 4; + r1 += bottom_blob_tm.cstep * 4; + r2 += bottom_blob_tm.cstep * 4; + r3 += bottom_blob_tm.cstep * 4; + tmpptr += 4; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + tmpptr[0] = r0[0]; + r0 += bottom_blob_tm.cstep; + tmpptr += 1; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator); + +#if __loongarch_sx + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + int* output2_tm = top_blob_tm.channel(p + 2); + int* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn4 = inch / 4; + int nn1 = inch % 4; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val01 = __lsx_vld(r0, 0); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + + __m128i _extval0 = __lsx_vslti_h(_val0, 0); + __m128i _extval1 = __lsx_vslti_h(_val1, 0); + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval0, _val0); + __m128i _val0h = __lsx_vilvh_h(_extval0, _val0); + __m128i _val1l = __lsx_vilvl_h(_extval1, _val1); + __m128i _val1h = __lsx_vilvh_h(_extval1, _val1); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + + _sum00 = __lsx_vmadd_w(_sum00, _val0l, _w0l); + _sum01 = __lsx_vmadd_w(_sum01, _val0h, _w0h); + _sum02 = __lsx_vmadd_w(_sum02, _val0l, _w1l); + _sum03 = __lsx_vmadd_w(_sum03, _val0h, _w1h); + _sum10 = __lsx_vmadd_w(_sum10, _val1l, _w0l); + _sum11 = __lsx_vmadd_w(_sum11, _val1h, _w0h); + _sum12 = __lsx_vmadd_w(_sum12, _val1l, _w1l); + _sum13 = __lsx_vmadd_w(_sum13, _val1h, _w1h); + + r0 += 8; + k0 += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); 
+ } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + for (int j = 0; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(r0[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w16 = __lsx_vld(k0, 0); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _extval = __lsx_vslti_h(_val, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _vall = __lsx_vilvl_h(_extval, _val); + __m128i _valh = __lsx_vilvh_h(_extval, _val); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + __m128i _w0h = __lsx_vilvh_h(_extw16, _w16); + + _sum00 = __lsx_vmadd_w(_sum00, _vall, _w0l); + _sum10 = __lsx_vmadd_w(_sum10, _valh, _w0h); + + r0 += 2; + k0 += 4; + } + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm[1] = sum[4]; + output1_tm[1] = sum[5]; + output2_tm[1] = sum[6]; + output3_tm[1] = sum[7]; + output0_tm += 2; + output1_tm += 2; + output2_tm += 2; + output3_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn4 = inch / 4; + int nn1 = inch % 4; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _val0h = __lsx_vilvh_h(_extval16, _val16); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + + _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l); + _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h); + _sum2 = __lsx_vmadd_w(_sum2, _val0l, _w1l); + _sum3 = __lsx_vmadd_w(_sum3, _val0h, _w1h); + + r0 += 4; + k0 += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + + for (int j = 0; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_w(r0[0]); + __m128i _w16 = __lsx_vld(k0, 0); + + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + + _sum0 = __lsx_vmadd_w(_sum0, _val, _w0l); + + r0 += 1; + k0 += 4; + } + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm += 1; + output1_tm += 1; + output2_tm += 1; + output3_tm += 1; + } + } + } +#else // __loongarch_sx + int nn_outch = outch >> 1; + int 
remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + + const Mat kernel0_tm = kernel_tm.channel(p / 2); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + int nn1 = inch; + + for (int j = 0; j < nn1; j++) + { + signed short val0 = r0[0]; + signed short val1 = r0[1]; + signed short w0 = k0[0]; + signed short w1 = k0[1]; + + sum00 += val0 * w0; + sum01 += val1 * w0; + sum10 += val0 * w1; + sum11 += val1 * w1; + + r0 += 2; + k0 += 2; + } + + output0_tm[0] = sum00; + output0_tm[1] = sum01; + output1_tm[0] = sum10; + output1_tm[1] = sum11; + output0_tm += 2; + output1_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int sum0 = 0; + int sum1 = 0; + + int nn1 = inch; + + for (int j = 0; j < nn1; j++) + { + signed short val0 = r0[0]; + signed short w0 = k0[0]; + signed short w1 = k0[1]; + + sum0 += val0 * w0; + sum1 += val0 * w1; + + r0 += 1; + k0 += 2; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + output0_tm += 1; + output1_tm += 1; + } + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + +#if __loongarch_sx + const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); +#else + const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2); +#endif + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int sum0 = 0; + int sum1 = 0; + +#if __loongarch_sx + int nn4 = inch / 4; + int nn1 = inch % 4; + + if (nn4 > 0) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + + __m128i _w16 = __lsx_vld(k0, 0); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _val0h = __lsx_vilvh_h(_extval16, _val16); + + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + __m128i _w0h = __lsx_vilvh_h(_extw16, _w16); + + _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l); + _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h); + + r0 += 8; + k0 += 4; + } + + sum0 = __lsx_reduce_add_w(_sum0); + sum1 = __lsx_reduce_add_w(_sum1); + } +#else // __loongarch_sx + int nn1 = inch; +#endif // __loongarch_sx + + for (int q = 0; q < nn1; q++) + { + signed short val0 = r0[0]; + signed short val1 = r0[1]; + signed short w = k0[0]; + + sum0 += val0 * w; + sum1 += val1 * w; + + k0 += 1; + r0 += 2; + } + + output0_tm[0] = sum0; + output0_tm[1] = sum1; + output0_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int sum = 0; + +#if __loongarch_sx + int nn4 = inch / 4; + int nn1 = inch % 4; + + if (nn4 > 0) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + 
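+                        // the pattern below widens int16 lanes to int32: vslti_h(x, 0)
+                        // builds an all-ones mask in every lane where x is negative, and
+                        // vilvl_h/vilvh_h interleave that mask above the values, giving
+                        // sign-extended 32-bit lanes ready for __lsx_vmadd_w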
__m128i _w16 = __lsx_vld(k0, 0); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + + _sum = __lsx_vmadd_w(_sum, _val0l, _w0l); + + r0 += 4; + k0 += 4; + } + + sum = __lsx_reduce_add_w(_sum); + } +#else // __loongarch_sx + int nn1 = inch; +#endif // __loongarch_sx + + for (int q = 0; q < nn1; q++) + { + signed short val = r0[0]; + signed short w = k0[0]; + + sum += val * w; + + k0 += 1; + r0 += 1; + } + + output0_tm[0] = sum; + output0_tm++; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack4.h b/src/layer/loongarch/convolution_winograd_dot_pack4.h new file mode 100644 index 000000000000..66002a62a625 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack4.h @@ -0,0 +1,448 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_pack4_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 4, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 12) + bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + (tiles % 12 % 4) / 2 + tiles % 12 % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 8) + bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 4) + bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 11 < tiles; i += 12) + { + float* tmpptr = tm2.row(i / 12); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(r0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(r0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(r0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(r0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(r0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(r0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(r0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(r0 + 4 * 11, 0); + + __m128i _r01r = 
__lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 48; + } + } + for (; i + 7 < tiles; i += 8) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(r0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(r0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(r0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(r0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 32; + } + } + for (; i + 3 < tiles; i += 4) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x4 
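+                // (each _rN below holds the four packed channels of one tile; the
+                // vilvl_w/vilvh_w word interleaves followed by vilvl_d/vilvh_d pair
+                // them up so that each output vector holds one channel across the
+                // four tiles, i.e. the 4x4 block is transposed before being stored)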
+ __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 16; + } + } + for (; i + 1 < tiles; i += 2) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x2 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + + __m128i _r01_0 = __lsx_vilvl_w(_r1, _r0); + __m128i _r01_1 = __lsx_vilvh_w(_r1, _r0); + + __lsx_vst(_r01_0, tmpptr, 0); + __lsx_vst(_r01_1, tmpptr + 4, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 8; + } + } + for (; i < tiles; i++) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + __m128i _val = __lsx_vld(r0, 0); + __lsx_vst(_val, tmpptr, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 4; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 11 < tiles; i += 12) + { + const float* r0 = bb2.row(i / 12); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum8 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum9 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _suma = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sumb = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 48); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128i _val4567 = __lsx_vld(r0 + 4, 0); + __m128i _val89ab = __lsx_vld(r0 + 8, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = 
__lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + _sum8 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 0), _sum8); + _sum9 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 1), _sum9); + _suma = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 2), _suma); + _sumb = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 3), _sumb); + + r0 += 12; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + __lsx_vst(_sum4, output0_tm + 4 * 4, 0); + __lsx_vst(_sum5, output0_tm + 4 * 5, 0); + __lsx_vst(_sum6, output0_tm + 4 * 6, 0); + __lsx_vst(_sum7, output0_tm + 4 * 7, 0); + __lsx_vst(_sum8, output0_tm + 4 * 8, 0); + __lsx_vst(_sum9, output0_tm + 4 * 9, 0); + __lsx_vst(_suma, output0_tm + 4 * 10, 0); + __lsx_vst(_sumb, output0_tm + 4 * 11, 0); + + output0_tm += 4 * 12; + } + for (; i + 7 < tiles; i += 8) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128i _val4567 = __lsx_vld(r0 + 4, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + + r0 += 8; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + __lsx_vst(_sum4, output0_tm + 4 * 4, 0); + __lsx_vst(_sum5, output0_tm + 4 * 5, 0); + __lsx_vst(_sum6, output0_tm + 4 * 6, 0); + __lsx_vst(_sum7, output0_tm + 4 * 7, 0); + + output0_tm += 4 * 8; + } + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, 
(__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + + r0 += 4; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + + output0_tm += 4 * 4; + } + for (; i + 1 < tiles; i += 2) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 8); + __builtin_prefetch(k0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*r0++); + __m128 _val1 = __lsx_vreplfr2vr_s(*r0++); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + _sum1 = __lsx_vfmadd_s(_w0, _val1, _sum1); + + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + + output0_tm += 4 * 2; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 4); + __builtin_prefetch(k0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*r0++); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + k0 += 4; + } + + __lsx_vst(_sum, output0_tm, 0); + + output0_tm += 4; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h b/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h new file mode 100644 index 000000000000..f87aa9ef558a --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h @@ -0,0 +1,363 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
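+
+// Rough scalar picture of the reduction performed by the kernels in this file
+// (for orientation only; input_tm / kernel_tm / output_tm are illustrative names,
+// not identifiers used below): for every output channel p, transform index r and
+// tile i,
+//
+//     int sum = 0;
+//     for (int q = 0; q < inch; q++)              // inch is in pack8 layout
+//         for (int k = 0; k < 8; k++)
+//             sum += (int)input_tm[q * 8 + k] * (int)kernel_tm[q * 8 + k];
+//     output_tm[i] = sum;                         // stored as pack1 int32
+//
+// The vector code does the same multiply-accumulate on sign-extended int32 lanes
+// with __lsx_vmadd_w and, in the tail path, reduces lanes with __lsx_reduce_add_w.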
+ +static void convolution_winograd_dot_pack8to1_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 8, 0); + __lsx_vst(_r0, tmpptr, 0); + __lsx_vst(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __lsx_vst(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator); + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + int* output2_tm = top_blob_tm.channel(p + 2); + int* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val0_3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i 
_val0_7 = __lsx_vreplgr2vr_w(r0[7]); + __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]); + __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]); + __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]); + __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]); + __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]); + __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]); + __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]); + __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3); + _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2); + _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5); + _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4); + _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7); + _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6); + _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + int sum[8]; + __lsx_vst(_sum0, sum, 0); + __lsx_vst(_sum2, sum + 4, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm[1] = sum[4]; + output1_tm[1] = sum[5]; + output2_tm[1] = sum[6]; + output3_tm[1] = sum[7]; + output0_tm += 2; + output1_tm += 2; + output2_tm += 2; + output3_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; 
+ output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm += 1; + output1_tm += 1; + output2_tm += 1; + output3_tm += 1; + } + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int q = 0; q < inch; q++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 64); + __m128i _val0 = __lsx_vld(r0, 0); + __m128i _val1 = __lsx_vld(r0 + 8, 0); + + __m128i _extval0 = __lsx_vslti_h(_val0, 0); + __m128i _extval1 = __lsx_vslti_h(_val1, 0); + __m128i _val0l = __lsx_vilvl_h(_extval0, _val0); + __m128i _val0h = __lsx_vilvh_h(_extval0, _val0); + __m128i _val1l = __lsx_vilvl_h(_extval1, _val1); + __m128i _val1h = __lsx_vilvh_h(_extval1, _val1); + + __m128i _w0 = __lsx_vld(k0, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0l); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0h); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1l); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1h); + + k0 += 8; + r0 += 16; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + output0_tm[0] = __lsx_reduce_add_w(_sum0); + output0_tm[1] = __lsx_reduce_add_w(_sum2); + output0_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int q = 0; q < inch; q++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 32); + __m128i _val = __lsx_vld(r0, 0); + + __m128i _extval = __lsx_vslti_h(_val, 0); + __m128i _vall = __lsx_vilvl_h(_extval, _val); + __m128i _valh = __lsx_vilvh_h(_extval, _val); + + __m128i _w0 = __lsx_vld(k0, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _vall); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _valh); + + k0 += 8; + r0 += 8; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + output0_tm[0] = __lsx_reduce_add_w(_sum0); + output0_tm++; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h b/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h new file mode 100644 index 000000000000..c20400cbf8c3 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h @@ -0,0 +1,233 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_pack8to4_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 8, 0); + __lsx_vst(_r0, tmpptr, 0); + __lsx_vst(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __lsx_vst(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val0_3 = 
__lsx_vreplgr2vr_w(r0[3]); + __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val0_7 = __lsx_vreplgr2vr_w(r0[7]); + __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]); + __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]); + __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]); + __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]); + __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]); + __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]); + __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]); + __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3); + _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2); + _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5); + _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4); + _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7); + _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6); + _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum2, output0_tm + 4, 0); + + output0_tm += 8; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + __lsx_vst(_sum0, output0_tm, 0); + output0_tm += 4; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform.h 
b/src/layer/loongarch/convolution_winograd_transform.h new file mode 100644 index 000000000000..624600e95a0d --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform.h @@ -0,0 +1,405 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd43_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[6][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + float r00 = r0[0]; + float r01 = r0[1]; + float r02 = r0[2]; + float r03 = r0[3]; + float r04 = r0[4]; + float r05 = r0[5]; + + float tmp0m = 4 * r00 - 5 * r02 + r04; + float tmp1m = -4 * (r01 + r02) + r04 + r03; + float tmp2m = 4 * (r01 - r02) + r04 - r03; + float tmp3m = -2 * (r01 - r03) + r04 - r02; + float tmp4m = 2 * (r01 - r03) + r04 - r02; + float tmp5m = 4 * r01 - 5 * r03 + r05; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + tmp[4][m] = tmp4m; + tmp[5][m] = tmp5m; + + r0 += w; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j); + float* r0_tm_1 = r0_tm_0 + tiles; + float* r0_tm_2 = r0_tm_0 + tiles * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 5; + + for (int m = 0; m < 6; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + float tmp04 = tmp[m][4]; + float tmp05 = tmp[m][5]; + + float r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04; + float r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03; + float r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03; + float r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02; + float r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02; + float r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + r0_tm_4[0] = r0tm4; + r0_tm_5[0] = r0tm5; + 
+ r0_tm_0 += tiles * 6; + r0_tm_1 += tiles * 6; + r0_tm_2 += tiles * 6; + r0_tm_3 += tiles * 6; + r0_tm_4 += tiles * 6; + r0_tm_5 += tiles * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + float bias0 = biasptr ? biasptr[p] : 0.f; + + float tmp[4][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); + const float* output0_tm_1 = output0_tm_0 + tiles; + const float* output0_tm_2 = output0_tm_0 + tiles * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 5; + + float* output0 = out0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + float out0tm0 = output0_tm_0[0]; + float out0tm1 = output0_tm_1[0]; + float out0tm2 = output0_tm_2[0]; + float out0tm3 = output0_tm_3[0]; + float out0tm4 = output0_tm_4[0]; + float out0tm5 = output0_tm_5[0]; + + float tmp02a = out0tm1 + out0tm2; + float tmp13a = out0tm1 - out0tm2; + + float tmp02b = out0tm3 + out0tm4; + float tmp13b = out0tm3 - out0tm4; + + float tmp0m = out0tm0 + tmp02a + tmp02b; + float tmp1m = tmp13a + tmp13b * 2; + float tmp2m = tmp02a + tmp02b * 4; + float tmp3m = out0tm5 + tmp13a + tmp13b * 8; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + + for (int m = 0; m < 4; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + float tmp04 = tmp[m][4]; + float tmp05 = tmp[m][5]; + + float tmp02a = tmp01 + tmp02; + float tmp13a = tmp01 - tmp02; + + float tmp02b = tmp03 + tmp04; + float tmp13b = tmp03 - tmp04; + + float out00 = bias0 + tmp00 + tmp02a + tmp02b; + float out01 = bias0 + tmp13a + tmp13b * 2; + float out02 = bias0 + tmp02a + tmp02b * 4; + float out03 = bias0 + tmp05 + tmp13a + tmp13b * 8; + + output0[0] = out00; + output0[1] = out01; + output0[2] = out02; + output0[3] = out03; + + output0 += outw; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 2; + const int h_tiles = (h - 2) / 2; + const int tiles = w_tiles * h_tiles; + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + 
// {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // }; + + // 0 = r00 - r02 + // 1 = r01 + r02 + // 2 = r02 - r01 + // 3 = r03 - r01 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 2) + (j * 2); + + for (int m = 0; m < 4; m++) + { + float r00 = r0[0]; + float r01 = r0[1]; + float r02 = r0[2]; + float r03 = r0[3]; + + float tmp0m = r00 - r02; + float tmp1m = r01 + r02; + float tmp2m = r02 - r01; + float tmp3m = r03 - r01; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + + r0 += w; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j); + float* r0_tm_1 = r0_tm_0 + tiles; + float* r0_tm_2 = r0_tm_0 + tiles * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 3; + + for (int m = 0; m < 4; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + + float r0tm0 = tmp00 - tmp02; + float r0tm1 = tmp01 + tmp02; + float r0tm2 = tmp02 - tmp01; + float r0tm3 = tmp03 - tmp01; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + + r0_tm_0 += tiles * 4; + r0_tm_1 += tiles * 4; + r0_tm_2 += tiles * 4; + r0_tm_3 += tiles * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 2; + const int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r00 + r01 + r02 + // 1 = r01 - r02 + r03 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + float bias0 = biasptr ? 
biasptr[p] : 0.f; + + float tmp[2][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); + const float* output0_tm_1 = output0_tm_0 + tiles; + const float* output0_tm_2 = output0_tm_0 + tiles * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 3; + + float* output0 = out0.row(i * 2) + (j * 2); + + for (int m = 0; m < 4; m++) + { + float out0tm0 = output0_tm_0[0]; + float out0tm1 = output0_tm_1[0]; + float out0tm2 = output0_tm_2[0]; + float out0tm3 = output0_tm_3[0]; + + float tmp0m = out0tm0 + out0tm1 + out0tm2; + float tmp1m = out0tm1 - out0tm2 + out0tm3; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + + output0_tm_0 += tiles * 4; + output0_tm_1 += tiles * 4; + output0_tm_2 += tiles * 4; + output0_tm_3 += tiles * 4; + } + + for (int m = 0; m < 2; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + + float out00 = bias0 + tmp00 + tmp01 + tmp02; + float out01 = bias0 + tmp01 - tmp02 + tmp03; + + output0[0] = out00; + output0[1] = out01; + + output0 += outw; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_int8.h b/src/layer/loongarch/convolution_winograd_transform_int8.h new file mode 100644 index 000000000000..09ef669e4733 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_int8.h @@ -0,0 +1,229 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
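+
+// The transforms below use the same B^T / A^T structure as the float winograd43
+// transforms earlier in this patch, but stay in integer arithmetic. A
+// back-of-the-envelope headroom check: each 1-D pass of the input transform is a
+// linear combination whose coefficient magnitudes sum to at most 4 + 5 + 1 = 10,
+// so after the two separable passes an int8 input of magnitude <= 127 grows to at
+// most 10 * 10 * 127 = 12700, well within the int16 storage used for
+// bottom_blob_tm. The output transform accumulates in int32 and rescales at the
+// end (the division by 576 below).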
+ +static void conv3x3s1_winograd43_transform_input_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + signed char r00 = r0[0]; + signed char r01 = r0[1]; + signed char r02 = r0[2]; + signed char r03 = r0[3]; + signed char r04 = r0[4]; + signed char r05 = r0[5]; + + short tmp0m = 4 * r00 - 5 * r02 + r04; + short tmp1m = -4 * (r01 + r02) + r04 + r03; + short tmp2m = 4 * (r01 - r02) + r04 - r03; + short tmp3m = -2 * (r01 - r03) + r04 - r02; + short tmp4m = 2 * (r01 - r03) + r04 - r02; + short tmp5m = 4 * r01 - 5 * r03 + r05; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + tmp[4][m] = tmp4m; + tmp[5][m] = tmp5m; + + r0 += w; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j); + short* r0_tm_1 = r0_tm_0 + tiles; + short* r0_tm_2 = r0_tm_0 + tiles * 2; + short* r0_tm_3 = r0_tm_0 + tiles * 3; + short* r0_tm_4 = r0_tm_0 + tiles * 4; + short* r0_tm_5 = r0_tm_0 + tiles * 5; + + for (int m = 0; m < 6; m++) + { + short tmp00 = tmp[m][0]; + short tmp01 = tmp[m][1]; + short tmp02 = tmp[m][2]; + short tmp03 = tmp[m][3]; + short tmp04 = tmp[m][4]; + short tmp05 = tmp[m][5]; + + short r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04; + short r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03; + short r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03; + short r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02; + short r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02; + short r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + r0_tm_4[0] = r0tm4; + r0_tm_5[0] = r0tm5; + + r0_tm_0 += tiles * 6; + r0_tm_1 += tiles * 6; + r0_tm_2 += tiles * 6; + r0_tm_3 += tiles * 6; + r0_tm_4 += tiles * 6; + r0_tm_5 += tiles * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - 
r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + int tmp[4][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 1; + const int* output0_tm_1 = output0_tm_0 + tiles * 1; + const int* output0_tm_2 = output0_tm_0 + tiles * 2; + const int* output0_tm_3 = output0_tm_0 + tiles * 3; + const int* output0_tm_4 = output0_tm_0 + tiles * 4; + const int* output0_tm_5 = output0_tm_0 + tiles * 5; + + int* output0 = out0.row(i * 4) + j * 4; + + for (int m = 0; m < 5; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b; + tmp[1][m] = tmp13a + tmp13b * 2; + tmp[2][m] = tmp02a + tmp02b * 4; + tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + for (int m = 5; m < 6; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4; + tmp[1][m] = (tmp13a + tmp13b * 2) * 4; + tmp[2][m] = (tmp02a + tmp02b * 4) * 4; + tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + + for (int m = 0; m < 4; m++) + { + const int* tmp0 = tmp[m]; + + int tmp02a = tmp0[1] + tmp0[2]; + int tmp13a = tmp0[1] - tmp0[2]; + + int tmp02b = tmp0[3] + tmp0[4]; + int tmp13b = tmp0[3] - tmp0[4]; + + output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576; + output0[1] = (tmp13a + tmp13b * 2) / 576; + output0[2] = (tmp02a + tmp02b * 4) / 576; + output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576; + + output0 += outw; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack4.h b/src/layer/loongarch/convolution_winograd_transform_pack4.h new file mode 100644 index 000000000000..3969e59cf09c --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack4.h @@ -0,0 +1,730 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
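+// This header provides the Winograd F(6x6,3x3), F(4x4,3x3) and F(2x2,3x3) input and
+// output tile transforms for fp32 data stored in pack4 layout (4 channels interleaved
+// per element), implemented with 128-bit LSX intrinsics. Each function walks the blob
+// tile by tile: the row-wise 1-D transform is applied first into an on-stack
+// tmp[][][4] buffer, then the same transform is applied column-wise and the results
+// are stored tile-major into the transform-domain blob.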
+ +static void conv3x3s1_winograd63_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 6; + const int h_tiles = (h - 2) / 6; + const int tiles = w_tiles * h_tiles; + + // const float itm[8][8] = { + // {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f}, + // + // {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f}, + // + // {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f}, + // {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f}, + // + // {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f}, + // {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f}, + // + // {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f} + // }; + + // 0 = r00 - r06 + (r04 - r02) * 5.25 + // 7 = r07 - r01 + (r03 - r05) * 5.25 + + // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05) + // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05) + + // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2) + // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2) + + // reuse r04 * 1.25 + // reuse r03 * 2.5 + // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) + // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[8][8][4]; + + __m128 _v5_25 = __lsx_vreplfr2vr_s(5.25f); + __m128 _vm4_25 = __lsx_vreplfr2vr_s(-4.25f); + __m128 _vm1_25 = __lsx_vreplfr2vr_s(-1.25f); + __m128 _v0_25 = __lsx_vreplfr2vr_s(0.25f); + __m128 _vm2_5 = __lsx_vreplfr2vr_s(-2.5f); + __m128 _v0_5 = __lsx_vreplfr2vr_s(0.5f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 6) + (j * 6) * 4; + + for (int m = 0; m < 8; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0); + __m128 _r06 = (__m128)__lsx_vld(r0 + 4 * 6, 0); + __m128 _r07 = (__m128)__lsx_vld(r0 + 4 * 7, 0); + + __m128 _tmp0m = __lsx_vfmadd_s(__lsx_vfsub_s(_r04, _r02), _v5_25, __lsx_vfsub_s(_r00, _r06)); + __m128 _tmp7m = __lsx_vfmadd_s(__lsx_vfsub_s(_r03, _r05), _v5_25, __lsx_vfsub_s(_r07, _r01)); + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp7m, tmp[7][m], 0); + + __m128 _tmp12a = __lsx_vfmadd_s(_r04, _vm4_25, __lsx_vfadd_s(_r02, _r06)); + __m128 _tmp12b = __lsx_vfmadd_s(_r03, _vm4_25, __lsx_vfadd_s(_r01, _r05)); + + __m128 _tmp1m = __lsx_vfadd_s(_tmp12a, _tmp12b); + __m128 _tmp2m = __lsx_vfsub_s(_tmp12a, _tmp12b); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + + __m128 _tmp34a = __lsx_vfmadd_s(_r04, _vm1_25, __lsx_vfmadd_s(_r02, _v0_25, _r06)); + __m128 _tmp34b = __lsx_vfmadd_s(_r05, _v2, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v0_5))); + + __m128 _tmp3m = __lsx_vfadd_s(_tmp34a, _tmp34b); + __m128 _tmp4m = __lsx_vfsub_s(_tmp34a, _tmp34b); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + + __m128 _tmp56a = 
__lsx_vfmadd_s(__lsx_vfmadd_s(_r04, _vm1_25, _r02), _v4, _r06); + __m128 _tmp56b = __lsx_vfmadd_s(_r05, _v0_5, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v2))); + + __m128 _tmp5m = __lsx_vfadd_s(_tmp56a, _tmp56b); + __m128 _tmp6m = __lsx_vfsub_s(_tmp56a, _tmp56b); + __lsx_vst(_tmp5m, tmp[5][m], 0); + __lsx_vst(_tmp6m, tmp[6][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5; + float* r0_tm_6 = r0_tm_0 + tiles * 4 * 6; + float* r0_tm_7 = r0_tm_0 + tiles * 4 * 7; + + for (int m = 0; m < 8; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0); + __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0); + + __m128 _r0tm0 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp04, _tmp02), _v5_25, __lsx_vfsub_s(_tmp00, _tmp06)); + __m128 _r0tm7 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp03, _tmp05), _v5_25, __lsx_vfsub_s(_tmp07, _tmp01)); + + __m128 _tmp12a = __lsx_vfmadd_s(_tmp04, _vm4_25, __lsx_vfadd_s(_tmp02, _tmp06)); + __m128 _tmp12b = __lsx_vfmadd_s(_tmp03, _vm4_25, __lsx_vfadd_s(_tmp01, _tmp05)); + + __m128 _r0tm1 = __lsx_vfadd_s(_tmp12a, _tmp12b); + __m128 _r0tm2 = __lsx_vfsub_s(_tmp12a, _tmp12b); + + __m128 _tmp34a = __lsx_vfmadd_s(_tmp04, _vm1_25, __lsx_vfmadd_s(_tmp02, _v0_25, _tmp06)); + __m128 _tmp34b = __lsx_vfmadd_s(_tmp05, _v2, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v0_5))); + + __m128 _r0tm3 = __lsx_vfadd_s(_tmp34a, _tmp34b); + __m128 _r0tm4 = __lsx_vfsub_s(_tmp34a, _tmp34b); + + __m128 _tmp56a = __lsx_vfmadd_s(__lsx_vfmadd_s(_tmp04, _vm1_25, _tmp02), _v4, _tmp06); + __m128 _tmp56b = __lsx_vfmadd_s(_tmp05, _v0_5, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v2))); + + __m128 _r0tm5 = __lsx_vfadd_s(_tmp56a, _tmp56b); + __m128 _r0tm6 = __lsx_vfsub_s(_tmp56a, _tmp56b); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + __lsx_vst(_r0tm6, r0_tm_6, 0); + __lsx_vst(_r0tm7, r0_tm_7, 0); + + r0_tm_0 += tiles * 4 * 8; + r0_tm_1 += tiles * 4 * 8; + r0_tm_2 += tiles * 4 * 8; + r0_tm_3 += tiles * 4 * 8; + r0_tm_4 += tiles * 4 * 8; + r0_tm_5 += tiles * 4 * 8; + r0_tm_6 += tiles * 4 * 8; + r0_tm_7 += tiles * 4 * 8; + } + } + } + } +} + +static void conv3x3s1_winograd63_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 6; + const int h_tiles = outh / 6; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[6][8] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r0 + 
(r1 + r2) + (r3 + r4) + (r5 + r6) * 32 + // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16 + // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8 + // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4 + // 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2 + // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6) + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? (__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[6][8][4]; + + __m128 _v32 = __lsx_vreplfr2vr_s(32.f); + __m128 _v16 = __lsx_vreplfr2vr_s(16.f); + __m128 _v8 = __lsx_vreplfr2vr_s(8.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5; + const float* output0_tm_6 = output0_tm_0 + tiles * 4 * 6; + const float* output0_tm_7 = output0_tm_0 + tiles * 4 * 7; + + float* output0 = out0.row(i * 6) + (j * 6) * 4; + + for (int m = 0; m < 8; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0); + __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0); + __m128 _out0tm6 = (__m128)__lsx_vld(output0_tm_6, 0); + __m128 _out0tm7 = (__m128)__lsx_vld(output0_tm_7, 0); + + __m128 _tmp024a = __lsx_vfadd_s(_out0tm1, _out0tm2); + __m128 _tmp135a = __lsx_vfsub_s(_out0tm1, _out0tm2); + + __m128 _tmp024b = __lsx_vfadd_s(_out0tm3, _out0tm4); + __m128 _tmp135b = __lsx_vfsub_s(_out0tm3, _out0tm4); + + __m128 _tmp024c = __lsx_vfadd_s(_out0tm5, _out0tm6); + __m128 _tmp135c = __lsx_vfsub_s(_out0tm5, _out0tm6); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b)); + __m128 _tmp2m = __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a)); + __m128 _tmp4m = __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a)); + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + + __m128 _tmp1m = __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a)); + __m128 _tmp3m = __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a)); + __m128 _tmp5m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm7, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c)); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + output0_tm_0 += tiles * 4 * 8; + output0_tm_1 += tiles * 4 * 8; + output0_tm_2 += tiles * 4 * 8; + output0_tm_3 += tiles * 4 * 8; + output0_tm_4 += tiles * 4 * 8; + output0_tm_5 += tiles * 4 * 8; + output0_tm_6 += tiles * 4 * 8; + output0_tm_7 += tiles * 4 * 8; + } + + for (int m = 0; m < 6; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = 
(__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0); + __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0); + + __m128 _tmp024a = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _tmp135a = __lsx_vfsub_s(_tmp01, _tmp02); + + __m128 _tmp024b = __lsx_vfadd_s(_tmp03, _tmp04); + __m128 _tmp135b = __lsx_vfsub_s(_tmp03, _tmp04); + + __m128 _tmp024c = __lsx_vfadd_s(_tmp05, _tmp06); + __m128 _tmp135c = __lsx_vfsub_s(_tmp05, _tmp06); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b))); + __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a))); + __m128 _out04 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a))); + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out02, output0 + 4 * 2, 0); + __lsx_vst(_out04, output0 + 4 * 4, 0); + + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a))); + __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a))); + __m128 _out05 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp07, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c))); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out03, output0 + 4 * 3, 0); + __lsx_vst(_out05, output0 + 4 * 5, 0); + + output0 += outw * 4; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[6][6][4]; + + __m128 _vm5 = __lsx_vreplfr2vr_s(-5.f); + __m128 _vm4 = __lsx_vreplfr2vr_s(-4.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _vm2 = __lsx_vreplfr2vr_s(-2.f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 6; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0); + + __m128 _tmp0m = __lsx_vfmadd_s(_r02, _vm5, __lsx_vfmadd_s(_r00, _v4, _r04)); + __m128 _tmp1m = __lsx_vfmadd_s(__lsx_vfadd_s(_r01, _r02), _vm4, __lsx_vfadd_s(_r04, _r03)); + __m128 _tmp2m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r02), _v4, __lsx_vfsub_s(_r04, _r03)); + 
__m128 _tmp3m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _vm2, __lsx_vfsub_s(_r04, _r02)); + __m128 _tmp4m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _v2, __lsx_vfsub_s(_r04, _r02)); + __m128 _tmp5m = __lsx_vfmadd_s(_r03, _vm5, __lsx_vfmadd_s(_r01, _v4, _r05)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5; + + for (int m = 0; m < 6; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + + __m128 _r0tm0 = __lsx_vfmadd_s(_tmp02, _vm5, __lsx_vfmadd_s(_tmp00, _v4, _tmp04)); + __m128 _r0tm1 = __lsx_vfmadd_s(__lsx_vfadd_s(_tmp01, _tmp02), _vm4, __lsx_vfadd_s(_tmp04, _tmp03)); + __m128 _r0tm2 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _v4, __lsx_vfsub_s(_tmp04, _tmp03)); + __m128 _r0tm3 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _vm2, __lsx_vfsub_s(_tmp04, _tmp02)); + __m128 _r0tm4 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _v2, __lsx_vfsub_s(_tmp04, _tmp02)); + __m128 _r0tm5 = __lsx_vfmadd_s(_tmp03, _vm5, __lsx_vfmadd_s(_tmp01, _v4, _tmp05)); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 4 * 6; + r0_tm_1 += tiles * 4 * 6; + r0_tm_2 += tiles * 4 * 6; + r0_tm_3 += tiles * 4 * 6; + r0_tm_4 += tiles * 4 * 6; + r0_tm_5 += tiles * 4 * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? 
(__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[4][6][4]; + + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _v8 = __lsx_vreplfr2vr_s(8.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5; + + float* output0 = out0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 6; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0); + __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0); + + __m128 _tmp02a = __lsx_vfadd_s(_out0tm1, _out0tm2); + __m128 _tmp13a = __lsx_vfsub_s(_out0tm1, _out0tm2); + + __m128 _tmp02b = __lsx_vfadd_s(_out0tm3, _out0tm4); + __m128 _tmp13b = __lsx_vfsub_s(_out0tm3, _out0tm4); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp02a), _tmp02b); + __m128 _tmp1m = __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a); + __m128 _tmp2m = __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a); + __m128 _tmp3m = __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_out0tm5, _tmp13a)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 4 * 6; + output0_tm_1 += tiles * 4 * 6; + output0_tm_2 += tiles * 4 * 6; + output0_tm_3 += tiles * 4 * 6; + output0_tm_4 += tiles * 4 * 6; + output0_tm_5 += tiles * 4 * 6; + } + + for (int m = 0; m < 4; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + + __m128 _tmp02a = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _tmp13a = __lsx_vfsub_s(_tmp01, _tmp02); + + __m128 _tmp02b = __lsx_vfadd_s(_tmp03, _tmp04); + __m128 _tmp13b = __lsx_vfsub_s(_tmp03, _tmp04); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp02a), _tmp02b)); + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a)); + __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a)); + __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_tmp05, _tmp13a))); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out02, output0 + 4 * 2, 0); + __lsx_vst(_out03, output0 + 4 * 3, 0); + + output0 += outw * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 2; + const int h_tiles = (h - 2) / 2; + const int tiles = w_tiles * h_tiles; + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // 
}; + + // 0 = r00 - r02 + // 1 = r01 + r02 + // 2 = r02 - r01 + // 3 = r03 - r01 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[4][4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 2) + (j * 2) * 4; + + for (int m = 0; m < 4; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + __m128 _tmp0m = __lsx_vfsub_s(_r00, _r02); + __m128 _tmp1m = __lsx_vfadd_s(_r01, _r02); + __m128 _tmp2m = __lsx_vfsub_s(_r02, _r01); + __m128 _tmp3m = __lsx_vfsub_s(_r03, _r01); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + + for (int m = 0; m < 4; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + + __m128 _r0tm0 = __lsx_vfsub_s(_tmp00, _tmp02); + __m128 _r0tm1 = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _r0tm2 = __lsx_vfsub_s(_tmp02, _tmp01); + __m128 _r0tm3 = __lsx_vfsub_s(_tmp03, _tmp01); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + + r0_tm_0 += tiles * 4 * 4; + r0_tm_1 += tiles * 4 * 4; + r0_tm_2 += tiles * 4 * 4; + r0_tm_3 += tiles * 4 * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 2; + const int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r00 + r01 + r02 + // 1 = r01 - r02 + r03 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? 
(__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[2][4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + + float* output0 = out0.row(i * 2) + (j * 2) * 4; + + for (int m = 0; m < 4; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _out0tm1), _out0tm2); + __m128 _tmp1m = __lsx_vfadd_s(__lsx_vfsub_s(_out0tm1, _out0tm2), _out0tm3); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + + output0_tm_0 += tiles * 4 * 4; + output0_tm_1 += tiles * 4 * 4; + output0_tm_2 += tiles * 4 * 4; + output0_tm_3 += tiles * 4 * 4; + } + + for (int m = 0; m < 2; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp01), _tmp02)); + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _tmp03)); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + + output0 += outw * 4; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h b/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h new file mode 100644 index 000000000000..8b31ce97a869 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h @@ -0,0 +1,166 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
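+// Winograd F(4x4,3x3) output transform for the int8 path, operating on int32
+// accumulators in pack4 layout. As in the scalar int8 code above, the last transform
+// column (m == 5) is additionally scaled by 4 via a left shift, and the final results
+// are divided by 576, which appears to undo the fixed-point scaling accumulated by the
+// integer input and weight transforms (the scalar reference path uses the same divisor).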
+ +static void conv3x3s1_winograd43_transform_output_pack4_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + int tmp[4][6][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 4; + const int* output0_tm_1 = output0_tm_0 + tiles * 4; + const int* output0_tm_2 = output0_tm_0 + tiles * 8; + const int* output0_tm_3 = output0_tm_0 + tiles * 12; + const int* output0_tm_4 = output0_tm_0 + tiles * 16; + const int* output0_tm_5 = output0_tm_0 + tiles * 20; + + int* output0 = out0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 5; m++) + { + __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0); + __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0); + __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0); + __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0); + __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0); + __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0); + + __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2); + __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2); + + __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4); + __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4); + + __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b); + __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + for (int m = 5; m < 6; m++) + { + __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0); + __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0); + __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0); + __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0); + __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0); + __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0); + + __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2); + __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2); + + __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4); + __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4); + + __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b); + __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3)); + + _tmp0m = __lsx_vslli_w(_tmp0m, 2); + _tmp1m = __lsx_vslli_w(_tmp1m, 2); + 
_tmp2m = __lsx_vslli_w(_tmp2m, 2); + _tmp3m = __lsx_vslli_w(_tmp3m, 2); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + + for (int m = 0; m < 4; m++) + { + __m128i _tmp00 = __lsx_vld(tmp[m][0], 0); + __m128i _tmp01 = __lsx_vld(tmp[m][1], 0); + __m128i _tmp02 = __lsx_vld(tmp[m][2], 0); + __m128i _tmp03 = __lsx_vld(tmp[m][3], 0); + __m128i _tmp04 = __lsx_vld(tmp[m][4], 0); + __m128i _tmp05 = __lsx_vld(tmp[m][5], 0); + + __m128i _tmp02a = __lsx_vadd_w(_tmp01, _tmp02); + __m128i _tmp13a = __lsx_vsub_w(_tmp01, _tmp02); + + __m128i _tmp02b = __lsx_vadd_w(_tmp03, _tmp04); + __m128i _tmp13b = __lsx_vsub_w(_tmp03, _tmp04); + + __m128i _out00 = __lsx_vadd_w(__lsx_vadd_w(_tmp00, _tmp02a), _tmp02b); + __m128i _out01 = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _out02 = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _out03 = __lsx_vadd_w(__lsx_vadd_w(_tmp05, _tmp13a), __lsx_vslli_w(_tmp13b, 3)); + + // TODO use integer trick for division by 576 + __m128 _v576 = __lsx_vreplfr2vr_s(1.0 / 576); + _out00 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out00), _v576)); + _out01 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out01), _v576)); + _out02 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out02), _v576)); + _out03 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out03), _v576)); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out02, output0 + 8, 0); + __lsx_vst(_out03, output0 + 12, 0); + + output0 += outw * 4; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h b/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h new file mode 100644 index 000000000000..5e49a87669a6 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h @@ -0,0 +1,132 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
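+// Winograd F(4x4,3x3) input transform for int8 data in pack8 layout. Each load pulls
+// 16 int8 values (two pack8 elements); __lsx_vslti_b computes the per-byte sign mask
+// and __lsx_vilvl_b / __lsx_vilvh_b interleave it with the data to sign-extend the
+// bytes to int16. The transform coefficients 4 and 2 are applied as left shifts of the
+// halfwords, so only the factor 5 needs an explicit multiply.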
+ +static void conv3x3s1_winograd43_transform_input_pack8_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6][8]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4) * 8; + + for (int m = 0; m < 6; m++) + { + __m128i _r00_01 = __lsx_vld(r0, 0); + __m128i _r02_03 = __lsx_vld(r0 + 16, 0); + __m128i _r04_05 = __lsx_vld(r0 + 32, 0); + __m128i _extr0001 = __lsx_vslti_b(_r00_01, 0); + __m128i _extr0203 = __lsx_vslti_b(_r02_03, 0); + __m128i _extr0405 = __lsx_vslti_b(_r04_05, 0); + __m128i _r00 = __lsx_vilvl_b(_extr0001, _r00_01); + __m128i _r01 = __lsx_vilvh_b(_extr0001, _r00_01); + __m128i _r02 = __lsx_vilvl_b(_extr0203, _r02_03); + __m128i _r03 = __lsx_vilvh_b(_extr0203, _r02_03); + __m128i _r04 = __lsx_vilvl_b(_extr0405, _r04_05); + __m128i _r05 = __lsx_vilvh_b(_extr0405, _r04_05); + + __m128i _v5 = __lsx_vreplgr2vr_h(5); + + __m128i _tmp0m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r00, 2), _r04), __lsx_vmul_h(_r02, _v5)); + __m128i _tmp1m = __lsx_vsub_h(__lsx_vadd_h(_r04, _r03), __lsx_vslli_h(__lsx_vadd_h(_r01, _r02), 2)); + __m128i _tmp2m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r03), __lsx_vslli_h(__lsx_vsub_h(_r01, _r02), 2)); + __m128i _tmp3m = __lsx_vsub_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1)); + __m128i _tmp4m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1)); + __m128i _tmp5m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r01, 2), _r05), __lsx_vmul_h(_r03, _v5)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + r0 += w * 8; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j) * 8; + short* r0_tm_1 = r0_tm_0 + tiles * 8; + short* r0_tm_2 = r0_tm_0 + tiles * 16; + short* r0_tm_3 = r0_tm_0 + tiles * 24; + short* r0_tm_4 = r0_tm_0 + tiles * 32; + short* r0_tm_5 = r0_tm_0 + tiles * 40; + + for (int m = 0; m < 6; m++) + { + __m128i _tmp00 = __lsx_vld(tmp[m][0], 0); + __m128i _tmp01 = __lsx_vld(tmp[m][1], 0); + __m128i _tmp02 = __lsx_vld(tmp[m][2], 0); + __m128i _tmp03 = __lsx_vld(tmp[m][3], 0); + __m128i _tmp04 = __lsx_vld(tmp[m][4], 0); + __m128i _tmp05 = __lsx_vld(tmp[m][5], 0); + + __m128i _v5 = __lsx_vreplgr2vr_h(5); + + __m128i _r0tm0 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp00, 2), _tmp04), __lsx_vmul_h(_tmp02, _v5)); + __m128i _r0tm1 = __lsx_vsub_h(__lsx_vadd_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vadd_h(_tmp01, _tmp02), 2)); 
+ __m128i _r0tm2 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp02), 2)); + __m128i _r0tm3 = __lsx_vsub_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1)); + __m128i _r0tm4 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1)); + __m128i _r0tm5 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp01, 2), _tmp05), __lsx_vmul_h(_tmp03, _v5)); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 48; + r0_tm_1 += tiles * 48; + r0_tm_2 += tiles * 48; + r0_tm_3 += tiles * 48; + r0_tm_4 += tiles * 48; + r0_tm_5 += tiles * 48; + } + } + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_3x3.h b/src/layer/loongarch/convolutiondepthwise_3x3.h new file mode 100644 index 000000000000..1c37f7789f3b --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_3x3.h @@ -0,0 +1,193 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* kernel = _kernel; + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const float bias0 = bias ? 
bias[g] : 0.f; + + const float* kernel0 = kernel + g * 9; + + float* outptr0 = out; + float* outptr1 = outptr0 + outw; + + const float* img0 = bottom_blob.channel(g); + + const float* r0 = img0; + const float* r1 = img0 + w; + const float* r2 = img0 + w * 2; + const float* r3 = img0 + w * 3; + + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + int i = 0; + + for (; i + 1 < outh; i += 2) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + float sum2 = bias0; + + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum2 += r1[0] * k0[0]; + sum2 += r1[1] * k0[1]; + sum2 += r1[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum2 += r2[0] * k1[0]; + sum2 += r2[1] * k1[1]; + sum2 += r2[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + sum2 += r3[0] * k2[0]; + sum2 += r3[1] * k2[1]; + sum2 += r3[2] * k2[2]; + + *outptr0 = sum; + *outptr1 = sum2; + + r0++; + r1++; + r2++; + r3++; + outptr0++; + outptr1++; + } + + r0 += 2 + w; + r1 += 2 + w; + r2 += 2 + w; + r3 += 2 + w; + + outptr0 += outw; + outptr1 += outw; + } + + for (; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + + *outptr0 = sum; + + r0++; + r1++; + r2++; + outptr0++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } +} + +static void convdw3x3s2_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* kernel = _kernel; + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const float bias0 = bias ? bias[g] : 0.f; + + const float* kernel0 = kernel + g * 9; + + float* outptr = out; + + const float* img0 = bottom_blob.channel(g); + + const float* r0 = img0; + const float* r1 = img0 + w; + const float* r2 = img0 + w * 2; + + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + int i = 0; + + for (; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + + *outptr = sum; + + r0 += 2; + r1 += 2; + r2 += 2; + outptr++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h b/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h new file mode 100644 index 000000000000..48ae66412fc1 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h @@ -0,0 +1,464 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out.row(0); + float* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i + 1 < outh; i += 2) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + __builtin_prefetch(r3 + 32); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + __m128 _sum10 = _bias0; + __m128 _sum11 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01); + _sum10 = __lsx_vfmadd_s(_r10, _k00, _sum10); + _sum10 = __lsx_vfmadd_s(_r11, _k01, _sum10); + _sum10 = __lsx_vfmadd_s(_r12, _k02, _sum10); + _sum11 = __lsx_vfmadd_s(_r11, _k00, _sum11); + _sum11 = __lsx_vfmadd_s(_r12, _k01, _sum11); + _sum11 = __lsx_vfmadd_s(_r13, _k02, _sum11); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = 
__lsx_vfmadd_s(_r21, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01); + _sum10 = __lsx_vfmadd_s(_r20, _k10, _sum10); + _sum10 = __lsx_vfmadd_s(_r21, _k11, _sum10); + _sum10 = __lsx_vfmadd_s(_r22, _k12, _sum10); + _sum11 = __lsx_vfmadd_s(_r21, _k10, _sum11); + _sum11 = __lsx_vfmadd_s(_r22, _k11, _sum11); + _sum11 = __lsx_vfmadd_s(_r23, _k12, _sum11); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + + _sum10 = __lsx_vfmadd_s(_r30, _k20, _sum10); + _sum10 = __lsx_vfmadd_s(_r31, _k21, _sum10); + _sum10 = __lsx_vfmadd_s(_r32, _k22, _sum10); + _sum11 = __lsx_vfmadd_s(_r31, _k20, _sum11); + _sum11 = __lsx_vfmadd_s(_r32, _k21, _sum11); + _sum11 = __lsx_vfmadd_s(_r33, _k22, _sum11); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + __lsx_vst(_sum10, outptr1, 0); + __lsx_vst(_sum11, outptr1 + 4, 0); + + outptr0 += 4 * 2; + outptr1 += 4 * 2; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + r3 += 4 * 2; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + + __m128 _sum0 = _bias0; + __m128 _sum1 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1); + _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1); + _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1); + _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1); + _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + + _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1); + _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1); + _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + + outptr0 += 4; + outptr1 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + } + + r0 += 2 * 4 + w * 4; + r1 += 2 * 4 + w * 4; + r2 += 2 * 4 + w * 4; + r3 += 2 * 4 + w * 4; + + outptr0 += outw * 4; + outptr1 += outw * 4; + } + for (; i < outh; i++) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, 
_k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = __lsx_vfmadd_s(_r21, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + } + + r0 += 2 * 4; + r1 += 2 * 4; + r2 += 2 * 4; + } + } +} + +static void convdw3x3s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 4; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(r1 + 64); + __builtin_prefetch(r2 + 64); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r02, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r04, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r12, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r14, _k12, _sum01); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = __lsx_vfmadd_s(_r22, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r24, _k22, _sum01); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4 * 4; + r1 += 4 * 4; + r2 += 4 * 4; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, 
_sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h b/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h new file mode 100644 index 000000000000..4f94c5e69958 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h @@ -0,0 +1,511 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw5x5s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out.row(0); + float* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + const float* r5 = img0.row(5); + + int i = 0; + for (; i + 1 < outh; i += 2) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + __builtin_prefetch(r4 + 16); + __builtin_prefetch(r5 + 16); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + __m128 _sum1 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1); + _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1); + _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1); + _sum1 = 
__lsx_vfmadd_s(_r13, _k03, _sum1); + _sum1 = __lsx_vfmadd_s(_r14, _k04, _sum1); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1); + _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1); + _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1); + _sum1 = __lsx_vfmadd_s(_r23, _k13, _sum1); + _sum1 = __lsx_vfmadd_s(_r24, _k14, _sum1); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1); + _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1); + _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1); + _sum1 = __lsx_vfmadd_s(_r33, _k23, _sum1); + _sum1 = __lsx_vfmadd_s(_r34, _k24, _sum1); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r40, _k30, _sum1); + _sum1 = __lsx_vfmadd_s(_r41, _k31, _sum1); + _sum1 = __lsx_vfmadd_s(_r42, _k32, _sum1); + _sum1 = __lsx_vfmadd_s(_r43, _k33, _sum1); + _sum1 = __lsx_vfmadd_s(_r44, _k34, _sum1); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __m128 _r50 = (__m128)__lsx_vld(r5, 0); + __m128 _r51 = (__m128)__lsx_vld(r5 + 4, 0); + __m128 _r52 = (__m128)__lsx_vld(r5 + 4 * 2, 0); + 
__m128 _r53 = (__m128)__lsx_vld(r5 + 4 * 3, 0); + __m128 _r54 = (__m128)__lsx_vld(r5 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r50, _k40, _sum1); + _sum1 = __lsx_vfmadd_s(_r51, _k41, _sum1); + _sum1 = __lsx_vfmadd_s(_r52, _k42, _sum1); + _sum1 = __lsx_vfmadd_s(_r53, _k43, _sum1); + _sum1 = __lsx_vfmadd_s(_r54, _k44, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + + outptr0 += 4; + outptr1 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + r5 += 4; + } + + r0 += 4 * 4 + w * 4; + r1 += 4 * 4 + w * 4; + r2 += 4 * 4 + w * 4; + r3 += 4 * 4 + w * 4; + r4 += 4 * 4 + w * 4; + r5 += 4 * 4 + w * 4; + + outptr0 += outw * 4; + outptr1 += outw * 4; + } + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + __builtin_prefetch(r4 + 16); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = 
(__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + } + + r0 += 4 * 4; + r1 += 4 * 4; + r2 += 4 * 4; + r3 += 4 * 4; + r4 += 4 * 4; + } + } +} + +static void convdw5x5s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 4; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + __builtin_prefetch(r3 + 32); + __builtin_prefetch(r4 + 32); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = 
(__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + r3 += 4 * 2; + r4 += 4 * 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp new file mode 100644 index 000000000000..4d134cc4a39a --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp @@ -0,0 +1,966 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolutiondepthwise_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#include "convolutiondepthwise_3x3.h" + +#if __loongarch_sx +#include "convolutiondepthwise_3x3_pack4.h" +#include "convolutiondepthwise_5x5_pack4.h" +#endif // __loongarch_sx + +ConvolutionDepthWise_loongarch::ConvolutionDepthWise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + activation = 0; +} + +int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + activation = create_activation_layer(activation_type, activation_params, opt); + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 4 == 0 ? 4 : 1; + } +#endif + +#if __loongarch_sx + // pack4 + if (elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 4, opt); + } +#endif // __loongarch_sx + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::create_group_ops(const Option& opt) +{ + // create Convolution op for each group + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + for (int i = 0; i < (int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + group_ops.resize(group); + + for (int g = 0; g < group; g++) + { + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); + Mat bias_data_g; + if (bias_term) + bias_data_g = bias_data.range(num_output_g * g, num_output_g); + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output_g); // num_output + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, 0); // pad_w + pd.set(14, 0); // pad_h + pd.set(5, bias_term); + pd.set(6, maxk * channels_g * num_output_g); // weight_data_size + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + // set weights + if (bias_term) + { + ncnn::Mat weights[5]; + weights[0] = weight_data_g; + weights[1] = bias_data_g; + +#if NCNN_INT8 + if (int8_scale_term) + { + Mat weight_data_int8_scales_g(num_output_g); + weight_data_int8_scales_g.fill(weight_data_int8_scales[g]); + weights[2] = weight_data_int8_scales_g; + weights[3] = bottom_blob_int8_scales.range(g, 1); + } + if (int8_scale_term > 100) + { + weights[4] = top_blob_int8_scales.range(g, 1); + } +#endif + + 
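+            // load the per-group weight and bias mats (plus the int8 scale mats when quantized)
+            // into the child Convolution op created for this group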
op->load_model(ModelBinFromMatArray(weights)); + } + else + { + ncnn::Mat weights[4]; + weights[0] = weight_data_g; + +#if NCNN_INT8 + if (int8_scale_term) + { + Mat weight_data_int8_scales_g(num_output_g); + weight_data_int8_scales_g.fill(weight_data_int8_scales[g]); + weights[1] = weight_data_int8_scales_g; + weights[2] = bottom_blob_int8_scales.range(g, 1); + } + if (int8_scale_term > 100) + { + weights[3] = top_blob_int8_scales.range(g, 1); + } +#endif + + op->load_model(ModelBinFromMatArray(weights)); + } + + op->create_pipeline(opt); + + group_ops[g] = op; + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt) +{ + if (activation) + { + activation->destroy_pipeline(opt); + delete activation; + activation = 0; + } + + for (int i = 0; i < (int)group_ops.size(); i++) + { + group_ops[i]->destroy_pipeline(opt); + delete group_ops[i]; + } + group_ops.clear(); + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __loongarch_sx + if (elempack == 4) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw3x3s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw3x3s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw5x5s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw5x5s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g * 4; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0); + } + + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw3x3s1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw3x3s2_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * 
dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + float* outptr = top_blob.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[g]; + + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + float val = (float)sptr[space_ofs[k]]; + float w = (float)kptr[k]; + sum += val * w; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + return 0; + } + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 4 == 0 ? 4 : 1; + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _kernel_h = _weight_data.h; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + 
bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(7, group); + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_INT8 +int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + + if (elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 8, opt); + } + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + + int elembits = bottom_blob.elembits(); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + const int channels_g = channels * elempack / group; + + Mat scales(channels * elempack); + { + float* ps = scales; + for (int g = 0; g < group; g++) + { + float scale = bottom_blob_int8_scales[g]; + for (int q = 0; q < channels_g; q++) + { + *ps++ = scale; + } + } + } + + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + channels = bottom_blob_bordered.c; + elempack = bottom_blob_bordered.elempack; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + // depth-wise + if (channels * elempack == group && group == num_output) + { + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + bool use_int8_requantize = int8_scale_term > 100; + size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 8) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr + k * 8, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + } + + __m128 _scale_in0; + __m128 _scale_in1; + { + __m128 _bottom_blob_int8_scales0 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8, 0); + __m128 _bottom_blob_int8_scales1 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8 + 4, 0); + __m128 _weight_data_int8_scales0 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8, 0); + __m128 _weight_data_int8_scales1 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8 + 4, 0); + _scale_in0 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales0, _weight_data_int8_scales0)); + _scale_in1 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales1, _weight_data_int8_scales1)); + + __m128i _m0 = __lsx_vfcmp_cne_s(_weight_data_int8_scales0, __lsx_vreplfr2vr_s(0.f)); + __m128i _m1 = __lsx_vfcmp_cne_s(_weight_data_int8_scales1, __lsx_vreplfr2vr_s(0.f)); + _scale_in0 = (__m128)__lsx_vand_v((__m128i)_scale_in0, (__m128i)_m0); + _scale_in1 = (__m128)__lsx_vand_v((__m128i)_scale_in1, (__m128i)_m1); + } + + __m128 _sumfp32_0 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum0), _scale_in0); + __m128 _sumfp32_1 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum1), _scale_in1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + g * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + g * 8 + 4, 0); + _sumfp32_0 = __lsx_vfadd_s(_sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfadd_s(_sumfp32_1, _bias1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize and relu + __m128 _scale_out0 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8, 0); + __m128 _scale_out1 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, 
_scale_out0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_out1); + int64_t _sum8 = float2int8(_sumfp32_0, _sumfp32_1); + + *(int64_t*)outptr_s8 = _sum8; + outptr_s8 += 8; + } + else + { + // dequantize and relu + __lsx_vst(_sumfp32_0, outptr_f32, 0); + __lsx_vst(_sumfp32_1, outptr_f32 + 4, 0); + outptr_f32 += 8; + } + } + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float sumfp32 = sum * scale_in; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } + + return 0; + } + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 8 == 0 ? 8 : 1; + if (use_int8_requantize) + out_g_elempack = num_output_g % 8 == 0 ? 8 : 1; + else + out_g_elempack = num_output_g % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.h b/src/layer/loongarch/convolutiondepthwise_loongarch.h new file mode 100644 index 000000000000..554fe7643049 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.h @@ -0,0 +1,50 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
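+// ConvolutionDepthWise_loongarch: LSX pack4 fast paths for 3x3/5x5 stride-1/2 depth-wise
+// convolution, generic packed and unpacked fallbacks, and per-group child Convolution ops
+// for the grouped (non depth-wise) case.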
+ +#ifndef LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H +#define LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H + +#include "convolutiondepthwise.h" + +namespace ncnn { + +class ConvolutionDepthWise_loongarch : virtual public ConvolutionDepthWise +{ +public: + ConvolutionDepthWise_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +protected: + int create_group_ops(const Option& opt); +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* activation; + std::vector group_ops; + + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H diff --git a/src/layer/loongarch/crop_loongarch.cpp b/src/layer/loongarch/crop_loongarch.cpp new file mode 100644 index 000000000000..e7c588bc4760 --- /dev/null +++ b/src/layer/loongarch/crop_loongarch.cpp @@ -0,0 +1,399 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "crop_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Crop_loongarch::Crop_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +#if __loongarch_sx +static void crop_pack4_lsx(const Mat& src, Mat& dst, int top, int left) +{ + int w = dst.w; + int h = dst.h; + int right = src.w - dst.w - left; + + const float* ptr = src.row(top) + left * 4; + float* outptr = dst; + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } + + ptr += (left + right) * 4; + } +} +#endif // __loongarch_sx + +int Crop_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + int _woffset, _hoffset, _doffset, _coffset; + int _outw, _outh, _outd, _outc; + resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + + if (dims == 1) + { + int out_elempack = _outw % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw / out_elempack == w && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_woffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack); + + return 0; + } + } + + if (dims == 2) + { + int out_elempack = _outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh / out_elempack == h && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_hoffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset); + + return 0; + } + } + + if (dims == 3) + { + int out_elempack = _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + const Mat m = bottom_blob_sliced.channel(q); + Mat borderm = top_blob.channel(q); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + + return 0; + } + } + + if (dims == 4) + { + int out_elempack = _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h && _outd == d) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + for (int z = 0; z < _outd; z++) + { + const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset); + Mat borderm = top_blob.channel(q).depth(z); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + return Crop::forward(bottom_blob_unpacked, top_blob, opt); +} + +int Crop_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& reference_blob = bottom_blobs[1]; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int ref_elempack = reference_blob.elempack; + + Mat& top_blob = top_blobs[0]; + +#if __loongarch_sx + if (elempack == 4) + { + int _woffset, _hoffset, _doffset, _coffset; + int _outw, _outh, _outd, _outc; + if (woffset == -233) + { + resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + } + else + { + resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + } + + if (dims == 1) + { + int out_elempack = _outw % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw / out_elempack == w && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_woffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack); + + return 0; + } + } + + if (dims == 2) + { + int out_elempack = _outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh / out_elempack == h && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_hoffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset); + + return 0; + } + } + + if (dims == 3) + { + int out_elempack = _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + const Mat m = bottom_blob_sliced.channel(q); + Mat borderm = top_blob.channel(q); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + + return 0; + } + } + + if (dims == 4) + { + int out_elempack = _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h && _outd == d) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + for (int z = 0; z < _outd; z++) + { + const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset); + Mat borderm = top_blob.channel(q).depth(z); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat reference_blob_unpacked = reference_blob; + if (ref_elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(reference_blob, reference_blob_unpacked, 1, opt_pack1); + } + + std::vector bottom_blobs_unpacked(2); + bottom_blobs_unpacked[0] = bottom_blob_unpacked; + bottom_blobs_unpacked[1] = reference_blob_unpacked; + + return Crop::forward(bottom_blobs_unpacked, top_blobs, opt); +} + +} // namespace ncnn diff --git a/src/layer/loongarch/crop_loongarch.h b/src/layer/loongarch/crop_loongarch.h new file mode 100644 index 000000000000..0ba460256d6a --- /dev/null +++ b/src/layer/loongarch/crop_loongarch.h @@ -0,0 +1,34 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CROP_LOONGARCH_H +#define LAYER_CROP_LOONGARCH_H + +#include "crop.h" + +namespace ncnn { + +class Crop_loongarch : virtual public Crop +{ +public: + Crop_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CROP_LOONGARCH_H diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp new file mode 100644 index 000000000000..bb913909b551 --- /dev/null +++ b/src/layer/loongarch/deconvolution_loongarch.cpp @@ -0,0 +1,284 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "deconvolution_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "deconvolution_pack4.h" +#include "deconvolution_pack1to4.h" +#include "deconvolution_pack4to1.h" +#endif // __loongarch_sx + +Deconvolution_loongarch::Deconvolution_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Deconvolution_loongarch::create_pipeline(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int num_input = weight_data_size / maxk / num_output; + + Mat weight_data_transposed(weight_data.w); + { + float* pt = weight_data_transposed; + const float* p = weight_data; + + for (int i = 0; i < num_input * num_output; i++) + { + for (int k = 0; k < maxk; k++) + { + pt[maxk - 1 - k] = p[k]; + } + + p += maxk; + pt += maxk; + } + } + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } + +#if __loongarch_sx + // pack4 + if (elempack == 4 && out_elempack == 4) + { + } + + // pack1ton + if (elempack == 1 && out_elempack == 4) + { + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + } +#endif // __loongarch_sx + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Deconvolution_loongarch::destroy_pipeline(const Option& opt) +{ + return 0; +} + +int Deconvolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // deconvolv with NxN kernel + // value = value + bias + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + { + deconvolution_pack4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + { + deconvolution_pack1to4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + { + deconvolution_pack4to1_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float* outptr = top_blob_bordered.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data_tm.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + float w = kptr[k]; + + sum += val * w; + } + } + + kptr += maxk; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/deconvolution_loongarch.h b/src/layer/loongarch/deconvolution_loongarch.h new file mode 100644 index 000000000000..bb7653b563fa --- /dev/null +++ b/src/layer/loongarch/deconvolution_loongarch.h @@ -0,0 +1,38 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DECONVOLUTION_LOONGARCH_H +#define LAYER_DECONVOLUTION_LOONGARCH_H + +#include "deconvolution.h" + +namespace ncnn { + +class Deconvolution_loongarch : virtual public Deconvolution +{ +public: + Deconvolution_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +public: + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_DECONVOLUTION_LOONGARCH_H diff --git a/src/layer/loongarch/deconvolution_pack1to4.h b/src/layer/loongarch/deconvolution_pack1to4.h new file mode 100644 index 000000000000..ee1f932b57a9 --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack1to4.h @@ -0,0 +1,99 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
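// --- Editorial note (illustrative sketch, not part of the original patch) ---
// The pack1to4 kernel below, the pack4 and pack4to1 kernels that follow, and the
// scalar fallback in deconvolution_loongarch.cpp above all use the same
// output-centric "gather" formulation of deconvolution: for each output pixel
// (i, j) and kernel tap (y, x) the candidate source coordinate is computed and
// the tap is skipped unless it lines up with an actual input sample. The helper
// below (hypothetical name, for reference only) restates that index test in
// scalar form.
static inline bool deconv_gather_index(int out_coord, int k, int dilation, int stride,
                                       int kernel_extent, int in_size, int* src)
{
    // corresponds to sys/sxs = out_coord + k * dilation - (kernel_extent - 1) below
    int s = out_coord + k * dilation - (kernel_extent - 1);
    if (s < 0 || s % stride != 0)
        return false; // this tap does not align with any input sample
    int q = s / stride;
    if (q >= in_size)
        return false; // aligned, but falls outside the input
    *src = q; // the input row/column that contributes to this output pixel
    return true;
}
// The SIMD variants keep exactly this control flow and only vectorize the
// multiply-accumulate across the packed channel lanes.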
+ +static void deconvolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vreplfr2vr_s(val); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + kptr += maxk * 4; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/deconvolution_pack4.h b/src/layer/loongarch/deconvolution_pack4.h new file mode 100644 index 000000000000..179a410350fb --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack4.h @@ -0,0 +1,106 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
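// --- Editorial note (illustrative sketch, not part of the original patch) ---
// In the pack4 kernel below, every kernel tap stores a 4x4 weight block of
// 16 floats laid out as [input lane][output lane] by create_pipeline. Each of
// the four input lanes is broadcast and multiplied against one 4-wide weight
// row of that block, so one tap costs four __lsx_vfmadd_s. The scalar
// equivalent of the per-tap update is:
static inline void deconv_pack4_tap(const float val[4],  // 4 input lanes at (sy, sx)
                                    const float wblk[16], // 4x4 weight block for this tap
                                    float sum[4])         // 4 output lanes being accumulated
{
    for (int out_lane = 0; out_lane < 4; out_lane++)
    {
        sum[out_lane] += val[0] * wblk[0 * 4 + out_lane]
                       + val[1] * wblk[1 * 4 + out_lane]
                       + val[2] * wblk[2 * 4 + out_lane]
                       + val[3] * wblk[3 * 4 + out_lane];
    }
}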
+ +static void deconvolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack4.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = (y * kernel_w + x) * 16; + + __m128 _val0 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val1 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val2 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val3 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr + k, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + k + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + k + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + k + 12, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + } + } + + kptr += maxk * 16; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/deconvolution_pack4to1.h b/src/layer/loongarch/deconvolution_pack4to1.h new file mode 100644 index 000000000000..e13721c2c35d --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack4to1.h @@ -0,0 +1,101 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
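// --- Editorial note (illustrative sketch, not part of the original patch) ---
// The pack4to1 kernel below produces a single scalar per output pixel but still
// loads 4 packed input lanes per tap, so partial products are accumulated in a
// 4-wide vector and only folded down once per output pixel via
// __lsx_reduce_fadd_s before the bias-initialized scalar sum is finalized.
// The scalar meaning of that horizontal reduction is simply:
static inline float horizontal_sum4(const float v[4])
{
    // fold the 4 per-lane partial sums into one scalar
    return v[0] + v[1] + v[2] + v[3];
}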
+ +static void deconvolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_data_ptr) + { + sum = bias_data_ptr[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + kptr += maxk * 4; + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp new file mode 100644 index 000000000000..a141dd703601 --- /dev/null +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp @@ -0,0 +1,412 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
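// --- Editorial note (illustrative sketch, not part of the original patch) ---
// Like Deconvolution_loongarch::create_pipeline above, the depthwise
// create_pipeline below first builds weight_data_transposed by reversing the
// taps of every kernel_w x kernel_h kernel (pt[maxk - 1 - k] = p[k]), i.e. a
// 180-degree rotation, so the forward pass can gather with plain correlation
// indexing. A minimal standalone restatement of that flip:
static inline void flip_kernel_taps(const float* src, float* dst, int maxk, int num_kernels)
{
    for (int n = 0; n < num_kernels; n++)
    {
        for (int k = 0; k < maxk; k++)
            dst[maxk - 1 - k] = src[k]; // reverse tap order within one kernel
        src += maxk;
        dst += maxk;
    }
}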
+ +#include "deconvolutiondepthwise_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +DeconvolutionDepthWise_loongarch::DeconvolutionDepthWise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 4 == 0 ? 4 : 1; + } +#endif + + Mat weight_data_transposed(weight_data.w); + { + float* pt = weight_data_transposed; + const float* p = weight_data; + + for (int i = 0; i < (channels / group) * (num_output / group) * group; i++) + { + for (int k = 0; k < maxk; k++) + { + pt[maxk - 1 - k] = p[k]; + } + + p += maxk; + pt += maxk; + } + } + +#if __loongarch_sx + // pack4 + if (elempack == 4) + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 4, opt); + } +#endif // __loongarch_sx + + if (elempack == 1) + { + weight_data_tm = weight_data_transposed; + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int DeconvolutionDepthWise_loongarch::create_group_ops(const Option& opt) +{ + // create Deconvolution op for each group + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + for (int i = 0; i < (int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + group_ops.resize(group); + + for (int g = 0; g < group; g++) + { + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); + Mat bias_data_g; + if (bias_term) + bias_data_g = bias_data.range(num_output_g * g, num_output_g); + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output_g); // num_output + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, 0); // pad_w + pd.set(14, 0); // pad_h + pd.set(18, output_pad_right); + pd.set(19, output_pad_bottom); + pd.set(5, bias_term); + pd.set(6, maxk * channels_g * num_output_g); // weight_data_size + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + // set weights + if (bias_term) + { + ncnn::Mat weights[2]; + weights[0] = weight_data_g; + weights[1] = bias_data_g; + + op->load_model(ModelBinFromMatArray(weights)); + } + else + { + ncnn::Mat weights[1]; + weights[0] = weight_data_g; + + op->load_model(ModelBinFromMatArray(weights)); + } + + op->create_pipeline(opt); + + group_ops[g] = op; + } + + return 0; +} + +int DeconvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt) +{ + for (int i = 0; i < (int)group_ops.size(); i++) + { + group_ops[i]->destroy_pipeline(opt); + delete group_ops[i]; + } + group_ops.clear(); + + return 0; +} + +int DeconvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) 
const +{ + // convolv with NxN kernel + // value = value + bias + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __loongarch_sx + if (elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob_bordered.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g * 4; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0); + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob_bordered.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[g]; + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + float w = 
kptr[k]; + + sum += val * w; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + else + { + // group deconvolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 4 == 0 ? 4 : 1; + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif + + // unpacking + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); + } + + Mat top_blob_bordered_unpacked = top_blob_bordered; + if (out_g_elempack < out_elempack) + { + top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_bordered_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; + + // forward + op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); + } + else + { + top_blob_bordered = top_blob_bordered_unpacked; + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h new file mode 100644 index 000000000000..e41e7cac9e18 --- /dev/null +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h @@ -0,0 +1,43 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
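// --- Editorial note (illustrative sketch, not part of the original patch) ---
// In the group (non-depthwise) branch of DeconvolutionDepthWise_loongarch::forward
// above, packing is decided per group: a group only stays packed if its own
// per-group channel counts are multiples of 4. If the incoming blob is packed
// wider than the group allows it is unpacked to elempack 1 before the per-group
// Deconvolution ops run, and if the per-group output pack is narrower than the
// final out_elempack the results are repacked to 4 afterwards. The decision
// reduces to this arithmetic (assuming use_packing_layout and LSX are enabled):
static inline void group_pack_choice(int channels_per_group, int num_output_per_group,
                                     int* g_elempack, int* out_g_elempack)
{
    *g_elempack = (channels_per_group % 4 == 0) ? 4 : 1;
    *out_g_elempack = (num_output_per_group % 4 == 0) ? 4 : 1;
}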
+ +#ifndef LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H +#define LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H + +#include "deconvolutiondepthwise.h" + +namespace ncnn { + +class DeconvolutionDepthWise_loongarch : virtual public DeconvolutionDepthWise +{ +public: + DeconvolutionDepthWise_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int create_group_ops(const Option& opt); + +public: + std::vector group_ops; + + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H diff --git a/src/layer/loongarch/dequantize_loongarch.cpp b/src/layer/loongarch/dequantize_loongarch.cpp new file mode 100644 index 000000000000..5ee9595f89f0 --- /dev/null +++ b/src/layer/loongarch/dequantize_loongarch.cpp @@ -0,0 +1,838 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dequantize_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Dequantize_loongarch::Dequantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Dequantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // assert bottom_blob.elembits() == 32 + + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 2; + + top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, 
ptr, 0); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 2; + + top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 2; + + top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v2, ptr0 + 4, 0); + __lsx_vst(_v1, ptr1, 0); + __lsx_vst(_v3, ptr1 + 4, 0); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v2, ptr0 + 4, 0); + __lsx_vst(_v1, ptr1, 0); + __lsx_vst(_v3, ptr1 + 4, 0); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for 
num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/dequantize_loongarch.h b/src/layer/loongarch/dequantize_loongarch.h new file mode 100644 index 000000000000..61a408d5c505 --- /dev/null +++ b/src/layer/loongarch/dequantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_LOONGARCH_H +#define LAYER_DEQUANTIZE_LOONGARCH_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_loongarch : virtual public Dequantize +{ +public: + Dequantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/dropout_loongarch.cpp b/src/layer/loongarch/dropout_loongarch.cpp new file mode 100644 index 000000000000..04a1f9ea95d8 --- /dev/null +++ b/src/layer/loongarch/dropout_loongarch.cpp @@ -0,0 +1,75 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dropout_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Dropout_loongarch::Dropout_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Dropout_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + if (scale == 1.f) + { + return 0; + } + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmul_s(_p, _scale); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr * scale; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/dropout_loongarch.h b/src/layer/loongarch/dropout_loongarch.h new file mode 100644 index 000000000000..42810050677a --- /dev/null +++ b/src/layer/loongarch/dropout_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DROPOUT_LOONGARCH_H +#define LAYER_DROPOUT_LOONGARCH_H + +#include "dropout.h" + +namespace ncnn { + +class Dropout_loongarch : virtual public Dropout +{ +public: + Dropout_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DROPOUT_LOONGARCH_H diff --git a/src/layer/loongarch/eltwise_loongarch.cpp b/src/layer/loongarch/eltwise_loongarch.cpp new file mode 100644 index 000000000000..d803fc3db78e --- /dev/null +++ b/src/layer/loongarch/eltwise_loongarch.cpp @@ -0,0 +1,332 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "eltwise_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Eltwise_loongarch::Eltwise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Eltwise_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + int size = w * h * elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create_like(bottom_blob, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (op_type == Operation_PROD) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmul_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr * *ptr1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); 
+ _p = __lsx_vfmul_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr *= *ptr; + + ptr++; + outptr++; + } + } + } + } + if (op_type == Operation_SUM) + { + if (coeffs.w == 0) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfadd_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr + *ptr1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfadd_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr += *ptr; + + ptr++; + outptr++; + } + } + } + } + else + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + float coeff0 = coeffs[0]; + float coeff1 = coeffs[1]; +#if __loongarch_sx + __m128 _coeff0 = (__m128)__lsx_vreplfr2vr_s(coeff0); + __m128 _coeff1 = (__m128)__lsx_vreplfr2vr_s(coeff1); +#endif // __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmul_s(_p, _coeff0); + _p = __lsx_vfmadd_s(_coeff1, _p1, _p); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr * coeff0 + *ptr1 * coeff1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + float coeff = coeffs[b]; +#if __loongarch_sx + __m128 _coeff = (__m128)__lsx_vreplfr2vr_s(coeff); +#endif // __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_coeff, _p1, _p); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr += *ptr * coeff; + + ptr++; + outptr++; + } + } + } + } + } + if (op_type == Operation_MAX) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = 
bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmax_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = std::max(*ptr, *ptr1); + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = std::max(*ptr, *outptr); + + ptr++; + outptr++; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/eltwise_loongarch.h b/src/layer/loongarch/eltwise_loongarch.h new file mode 100644 index 000000000000..f9715b20cadc --- /dev/null +++ b/src/layer/loongarch/eltwise_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_ELTWISE_LOONGARCH_H +#define LAYER_ELTWISE_LOONGARCH_H + +#include "eltwise.h" + +namespace ncnn { + +class Eltwise_loongarch : virtual public Eltwise +{ +public: + Eltwise_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ELTWISE_LOONGARCH_H diff --git a/src/layer/loongarch/flatten_loongarch.cpp b/src/layer/loongarch/flatten_loongarch.cpp new file mode 100644 index 000000000000..6d9a86362873 --- /dev/null +++ b/src/layer/loongarch/flatten_loongarch.cpp @@ -0,0 +1,370 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
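+
+// Flatten with packing-aware layout handling: when the input is packed
+// (elempack == 4 for fp32, 8 for int8), flattening is not a plain copy,
+// because packed rows/channels interleave values that belong to different
+// output positions. The LSX path below de-interleaves fp32 data with a
+// 4x4 transpose built from __lsx_vilvl_w/__lsx_vilvh_w and
+// __lsx_vilvl_d/__lsx_vilvh_d. A minimal scalar sketch of the same
+// rearrangement (illustration only, not part of the build):
+//
+//   // in:  one packed row, w groups of 4 interleaved values
+//   // out: 4 planar output rows of length w
+//   for (int j = 0; j < w; j++)
+//       for (int k = 0; k < 4; k++)
+//           out[k][j] = in[j * 4 + k];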
+ +#include "flatten_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +Flatten_loongarch::Flatten_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Flatten_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h * d; + + int total = size * channels * elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = total % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (out_elempack == 1) + { + return Flatten::forward(bottom_blob, top_blob, opt); + } + + if (dims == 2 && elempack == 1) // out_elempack == 4 + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (dims == 2) + { +#if __loongarch_sx + if (elempack == 4) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + float* outptr0 = (float*)top_blob + w * i * 4; + float* outptr1 = (float*)top_blob + w * (i * 4 + 1); + float* outptr2 = (float*)top_blob + w * (i * 4 + 2); + float* outptr3 = (float*)top_blob + w * (i * 4 + 3); + + int j = 0; + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(ptr, 0); + __m128i _r1 = __lsx_vld(ptr + 4, 0); + __m128i _r2 = __lsx_vld(ptr + 4 * 2, 0); + __m128i _r3 = __lsx_vld(ptr + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + ptr += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; j < w; j++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + + ptr += 4; + } + } + } +#endif // __loongarch_sx + } + + if (dims == 3 || dims == 4) + { +#if __loongarch_sx + if (elempack == 4) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + float* outptr0 = (float*)top_blob + size * q * 4; + float* outptr1 = (float*)top_blob + size * (q * 4 + 1); + float* outptr2 = (float*)top_blob + size * (q * 4 + 2); + float* outptr3 = (float*)top_blob + size * (q * 4 + 3); + + int i = 0; + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(ptr, 0); + __m128i _r1 = 
__lsx_vld(ptr + 4, 0); + __m128i _r2 = __lsx_vld(ptr + 4 * 2, 0); + __m128i _r3 = __lsx_vld(ptr + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + ptr += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + + ptr += 4; + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + float* outptr = (float*)top_blob + size * q; + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __lsx_vst(__lsx_vld(ptr, 0), outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr++ = *ptr++; + } + } + } + } + + return 0; +} + +int Flatten_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h * d; + + int total = size * channels * elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = total % 8 == 0 ? 
8 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (out_elempack == 1) + { + return Flatten::forward(bottom_blob, top_blob, opt); + } + + if (dims == 2 && elempack == 1) // out_elempack == 8 + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (dims == 2) + { +#if __loongarch_sx + if (elempack == 8) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const signed char* ptr = bottom_blob.row(i); + signed char* outptr0 = (signed char*)top_blob + w * i * 8; + signed char* outptr1 = (signed char*)top_blob + w * (i * 8 + 1); + signed char* outptr2 = (signed char*)top_blob + w * (i * 8 + 2); + signed char* outptr3 = (signed char*)top_blob + w * (i * 8 + 3); + signed char* outptr4 = (signed char*)top_blob + w * (i * 8 + 4); + signed char* outptr5 = (signed char*)top_blob + w * (i * 8 + 5); + signed char* outptr6 = (signed char*)top_blob + w * (i * 8 + 6); + signed char* outptr7 = (signed char*)top_blob + w * (i * 8 + 7); + + int j = 0; + for (; j < w; j++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + *outptr4++ = ptr[4]; + *outptr5++ = ptr[5]; + *outptr6++ = ptr[6]; + *outptr7++ = ptr[7]; + + ptr += 8; + } + } + } +#endif // __loongarch_sx + } + + if (dims == 3 || dims == 4) + { +#if __loongarch_sx + if (elempack == 8) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + signed char* outptr0 = (signed char*)top_blob + size * q * 8; + signed char* outptr1 = (signed char*)top_blob + size * (q * 8 + 1); + signed char* outptr2 = (signed char*)top_blob + size * (q * 8 + 2); + signed char* outptr3 = (signed char*)top_blob + size * (q * 8 + 3); + signed char* outptr4 = (signed char*)top_blob + size * (q * 8 + 4); + signed char* outptr5 = (signed char*)top_blob + size * (q * 8 + 5); + signed char* outptr6 = (signed char*)top_blob + size * (q * 8 + 6); + signed char* outptr7 = (signed char*)top_blob + size * (q * 8 + 7); + + int i = 0; + for (; i < size; i++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + *outptr4++ = ptr[4]; + *outptr5++ = ptr[5]; + *outptr6++ = ptr[6]; + *outptr7++ = ptr[7]; + + ptr += 8; + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + signed char* outptr = (signed char*)top_blob + size * q; + + int i = 0; + for (; i < size; i++) + { + *outptr++ = *ptr++; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/flatten_loongarch.h b/src/layer/loongarch/flatten_loongarch.h new file mode 100644 index 000000000000..afd35c701f59 --- /dev/null +++ b/src/layer/loongarch/flatten_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_FLATTEN_LOONGARCH_H +#define LAYER_FLATTEN_LOONGARCH_H + +#include "flatten.h" + +namespace ncnn { + +class Flatten_loongarch : virtual public Flatten +{ +public: + Flatten_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_FLATTEN_LOONGARCH_H diff --git a/src/layer/loongarch/hardsigmoid_loongarch.cpp b/src/layer/loongarch/hardsigmoid_loongarch.cpp new file mode 100644 index 000000000000..9dfedb689bc5 --- /dev/null +++ b/src/layer/loongarch/hardsigmoid_loongarch.cpp @@ -0,0 +1,79 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
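+
+// HardSigmoid: y = clamp(alpha * x + beta, 0, 1).
+// The LSX loop below fuses this as vfmadd followed by vfmax/vfmin against
+// constant 0 and 1 vectors; the scalar tail uses the lower/upper thresholds
+// provided by the base HardSigmoid layer instead.
+// Scalar equivalent of one vector lane (sketch):
+//
+//   float y = x * alpha + beta;
+//   y = y < 0.f ? 0.f : (y > 1.f ? 1.f : y);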
+ +#include "hardsigmoid_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +HardSigmoid_loongarch::HardSigmoid_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int HardSigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(beta); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_alpha, _p, _beta); + _p = __lsx_vfmax_s(_p, _zero); + _p = __lsx_vfmin_s(_p, _one); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < lower) + *ptr = 0.f; + else if (*ptr > upper) + *ptr = 1.f; + else + *ptr = *ptr * alpha + beta; + ++ptr; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/hardsigmoid_loongarch.h b/src/layer/loongarch/hardsigmoid_loongarch.h new file mode 100644 index 000000000000..755ae89ff03e --- /dev/null +++ b/src/layer/loongarch/hardsigmoid_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_HARDSIGMOID_LOONGARCH_H +#define LAYER_HARDSIGMOID_LOONGARCH_H + +#include "hardsigmoid.h" + +namespace ncnn { + +class HardSigmoid_loongarch : virtual public HardSigmoid +{ +public: + HardSigmoid_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_HARDSIGMOID_LOONGARCH_H diff --git a/src/layer/loongarch/hardswish_loongarch.cpp b/src/layer/loongarch/hardswish_loongarch.cpp new file mode 100644 index 000000000000..f1417a7986c9 --- /dev/null +++ b/src/layer/loongarch/hardswish_loongarch.cpp @@ -0,0 +1,80 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "hardswish_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +HardSwish_loongarch::HardSwish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int HardSwish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(beta); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = __lsx_vfmadd_s(_alpha, _p, _beta); + _outp = __lsx_vfmax_s(_outp, _zero); + _outp = __lsx_vfmin_s(_outp, _one); + _outp = __lsx_vfmul_s(_outp, _p); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < lower) + *ptr = 0.f; + else if (*ptr > upper) + ; + else + *ptr = *ptr * (*ptr * alpha + beta); + ++ptr; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/hardswish_loongarch.h b/src/layer/loongarch/hardswish_loongarch.h new file mode 100644 index 000000000000..e9b0821245c3 --- /dev/null +++ b/src/layer/loongarch/hardswish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_HARDSWISH_LOONGARCH_H +#define LAYER_HARDSWISH_LOONGARCH_H + +#include "hardswish.h" + +namespace ncnn { + +class HardSwish_loongarch : virtual public HardSwish +{ +public: + HardSwish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_HARDSWISH_LOONGARCH_H diff --git a/src/layer/loongarch/innerproduct_loongarch.cpp b/src/layer/loongarch/innerproduct_loongarch.cpp new file mode 100644 index 000000000000..3dd6ff35e232 --- /dev/null +++ b/src/layer/loongarch/innerproduct_loongarch.cpp @@ -0,0 +1,1637 @@ +// yala is pleased to support the open source community by making ncnn available. 
+// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "innerproduct_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include "loongarch_activation.h" + +namespace ncnn { + +InnerProduct_loongarch::InnerProduct_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + flatten = 0; +} + +int InnerProduct_loongarch::create_pipeline(const Option& opt) +{ + { + flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + + ncnn::ParamDict pd; + + flatten->load_param(pd); + + flatten->create_pipeline(opt); + } + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + +#if __loongarch_sx + if (opt.use_fp16_storage) + { + return create_pipeline_fp16s(opt); + } +#endif + + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + + if (out_elempack == 4) + { + // src = inch-outch + // dst = 4-inch-outch/4 + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / 4, (size_t)4u * 4, 4); + + for (int q = 0; q + 3 < num_output; q += 4) + { + float* g0 = weight_data_tm.row(q / 4); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < 4; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + } + else + { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::destroy_pipeline(const Option& opt) +{ + if (flatten) + { + flatten->destroy_pipeline(opt); + delete flatten; + flatten = 0; + } + + return 0; +} + +int InnerProduct_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + +#if __loongarch_sx + if (opt.use_fp16_storage) + { + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif + + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { +#if __loongarch_sx + if (elempack == 4 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const float* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 0]); + _sum1 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 1]); + _sum2 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 2]); + _sum3 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 3]); + } + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128i _w = __lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3); + + m += 4; + kptr += 4; + } + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + _sum1 = activation_ps(_sum1, activation_type, activation_params); + _sum2 = activation_ps(_sum2, activation_type, activation_params); + _sum3 = activation_ps(_sum3, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + __lsx_vst(_sum1, outptr + 4, 0); + __lsx_vst(_sum2, outptr + 8, 0); + __lsx_vst(_sum3, outptr + 12, 0); + outptr += 16; + } + } + + if (elempack == 1 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const float* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(m, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + m += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(m[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + m += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + outptr += 4; + } + } + + if (elempack == 4 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const float* kptr = (const float*)weight_data_tm + 
num_input * p; + const float* m = bottom_blob.row(j); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = __lsx_vreplfr2vr_s(bias_data[p]); + } + + for (int i = 0; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 4); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128 _k = __lsx_vreplfr2vr_s(kptr[0]); + _sum = __lsx_vfmadd_s(_k, _val, _sum); + + m += 4; + kptr += 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } +#endif // __loongarch_sx + + if (elempack == 1 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const float* kptr = (const float*)weight_data_tm + num_input * p; + const float* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + int i = 0; +#if __loongarch_sx + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _m, _sum); + + m += 4; + kptr += 4; + } + sum += __lsx_reduce_fadd_s(_sum); +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum += *m * *kptr; + + m += 1; + kptr += 1; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_tm.row(p); + + const float* sptr = bottom_blob_flattened; + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(sptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(sptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + sptr += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + __lsx_vst(_sum0, outptr + p * 4, 0); + } + } +#endif // __loongarch_sx + + if (out_elempack == 1) + { + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + if (bias_term) + { + sum0 = bias_data[p]; + sum1 = bias_data[p + 1]; + sum2 = bias_data[p + 2]; + sum3 = bias_data[p + 3]; + } + + const float* w0 = (const float*)weight_data_tm + num_input * p; + const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); + const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); + const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); + + const float* m = bottom_blob_flattened; + + int i = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w0 + 16); + __builtin_prefetch(w1 + 16); + __builtin_prefetch(w2 + 16); + __builtin_prefetch(w3 + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w0 = (__m128)__lsx_vld(w0, 0); + __m128 _w1 = (__m128)__lsx_vld(w1, 0); + __m128 _w2 = (__m128)__lsx_vld(w2, 0); + __m128 _w3 = (__m128)__lsx_vld(w3, 0); + _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0); + _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1); + _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2); + _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + 
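+ // w3 advances in lockstep with m, w0, w1 and w2; each of the four output
+ // neurons keeps its own vector accumulator, reduced horizontally with
+ // __lsx_reduce_fadd_s once the vector loop ends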
w3 += 4; + } +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum0 += *m * *w0; + sum1 += *m * *w1; + sum2 += *m * *w2; + sum3 += *m * *w3; + + m++; + w0++; + w1++; + w2++; + w3++; + } + +#if __loongarch_sx + sum0 += __lsx_reduce_fadd_s(_sum0); + sum1 += __lsx_reduce_fadd_s(_sum1); + sum2 += __lsx_reduce_fadd_s(_sum2); + sum3 += __lsx_reduce_fadd_s(_sum3); +#endif // __loongarch_sx + + sum0 = activation_ss(sum0, activation_type, activation_params); + sum1 = activation_ss(sum1, activation_type, activation_params); + sum2 = activation_ss(sum2, activation_type, activation_params); + sum3 = activation_ss(sum3, activation_type, activation_params); + + top_blob[p] = sum0; + top_blob[p + 1] = sum1; + top_blob[p + 2] = sum2; + top_blob[p + 3] = sum3; + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const float* w = (const float*)weight_data_tm + num_input * p; + + const float* m = bottom_blob_flattened; + + int i = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = (__m128)__lsx_vld(w, 0); + _sum0 = __lsx_vfmadd_s(_w, _m, _sum0); + + m += 4; + w += 4; + } + sum += __lsx_reduce_fadd_s(_sum0); +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum += *m * *w; + + m++; + w++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + } + + return 0; +} + +#if __loongarch_sx +int InnerProduct_loongarch::create_pipeline_fp16s(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } + + // src = inch-outch + // dst = pb-inch-outch/pb + if (out_elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4); + + for (int q = 0; q + 3 < num_output; q += 4) + { + unsigned short* g0 = weight_data_tm.row(q / 4); + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 3); + + int p = 0; + for (; p + 3 < num_input; p += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(k0, 0); + __m128i _r1 = __lsx_vld(k1, 0); + __m128i _r2 = __lsx_vld(k2, 0); + __m128i _r3 = __lsx_vld(k3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __m128i _p0 = __lsx_vfcvt_h_s((__m128)_r0123_1, (__m128)_r0123_0); + __m128i _p1 = __lsx_vfcvt_h_s((__m128)_r0123_3, (__m128)_r0123_2); + + __lsx_vst(_p0, g0, 0); + __lsx_vst(_p1, g0 + 8, 0); + + k0 += 4; + k1 += 4; + k2 += 4; + k3 += 4; + g0 += 16; + } + for (; p < num_input; p++) + { + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); + g0 += 4; + } + } + } + + if (out_elempack == 1) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + if (elempack == 4 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 0]); + _sum1 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 1]); + _sum2 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 2]); + _sum3 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 3]); + } + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128i _w = (__m128i)__lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3); + + m += 4; + kptr += 4; + } + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + _sum1 = activation_ps(_sum1, activation_type, activation_params); + _sum2 = activation_ps(_sum2, activation_type, activation_params); + _sum3 = activation_ps(_sum3, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + __lsx_vst(_sum1, outptr + 4, 0); + __lsx_vst(_sum2, outptr + 8, 0); + __lsx_vst(_sum3, outptr + 12, 0); + outptr += 16; + } + } + + if (elempack == 1 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(m, 0); + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 8, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(_w01); + __m128 _w1 = __lsx_vfcvth_s_h(_w01); + __m128 _w2 = __lsx_vfcvtl_s_h(_w23); + __m128 _w3 = __lsx_vfcvth_s_h(_w23); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + m += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(m[0]); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + m += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + outptr += 4; + } + } + + if (elempack == 4 && num_output_elempack == 1) + { + float* outptr = 
top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = __lsx_vreplfr2vr_s(bias_data[p]); + } + + for (int i = 0; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 4); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128 _k = __lsx_vreplfr2vr_s(float16_to_float32(kptr[0])); + _sum = __lsx_vfmadd_s(_k, _val, _sum); + + m += 4; + kptr += 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + + if (elempack == 1 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + int i = 0; + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum = __lsx_vfmadd_s(_w, _m, _sum); + + m += 4; + kptr += 4; + } + sum += __lsx_reduce_fadd_s(_sum); + for (; i < num_input; i++) + { + sum += *m * float16_to_float32(*kptr); + + m += 1; + kptr += 1; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const unsigned short* kptr = weight_data_tm.row(p); + + const float* sptr = bottom_blob_flattened; + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(sptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(sptr, 0); + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 8, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(_w01); + __m128 _w1 = __lsx_vfcvth_s_h(_w01); + __m128 _w2 = __lsx_vfcvtl_s_h(_w23); + __m128 _w3 = __lsx_vfcvth_s_h(_w23); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + sptr += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + __lsx_vst(_sum0, outptr + p * 4, 0); + } + } + + if (out_elempack == 1) + { + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + if (bias_term) + { + sum0 = bias_data[p]; + sum1 = bias_data[p + 1]; + sum2 = bias_data[p + 2]; + sum3 = bias_data[p + 3]; + } + + const unsigned short* w0 = weight_data_tm.row(p); + const unsigned short* w1 = weight_data_tm.row(p + 1); + const unsigned short* w2 = weight_data_tm.row(p + 2); + const unsigned short* w3 = weight_data_tm.row(p + 3); + + const float* m = bottom_blob_flattened; + + int i = 0; + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w0 + 16); + __builtin_prefetch(w1 + 16); + __builtin_prefetch(w2 + 16); + __builtin_prefetch(w3 + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(__lsx_vld(w0, 0)); + __m128 _w1 = __lsx_vfcvtl_s_h(__lsx_vld(w1, 0)); + __m128 _w2 = __lsx_vfcvtl_s_h(__lsx_vld(w2, 0)); + __m128 _w3 = __lsx_vfcvtl_s_h(__lsx_vld(w3, 0)); + _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0); + _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1); + _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2); + _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + w3 += 4; + } + for (; i < num_input; i++) + { 
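+ // scalar tail: widen each fp16 weight to fp32 with float16_to_float32
+ // before the multiply-accumulate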
+ sum0 += *m * float16_to_float32(*w0); + sum1 += *m * float16_to_float32(*w1); + sum2 += *m * float16_to_float32(*w2); + sum3 += *m * float16_to_float32(*w3); + + m++; + w0++; + w1++; + w2++; + w3++; + } + + sum0 += __lsx_reduce_fadd_s(_sum0); + sum1 += __lsx_reduce_fadd_s(_sum1); + sum2 += __lsx_reduce_fadd_s(_sum2); + sum3 += __lsx_reduce_fadd_s(_sum3); + + sum0 = activation_ss(sum0, activation_type, activation_params); + sum1 = activation_ss(sum1, activation_type, activation_params); + sum2 = activation_ss(sum2, activation_type, activation_params); + sum3 = activation_ss(sum3, activation_type, activation_params); + + top_blob[p] = sum0; + top_blob[p + 1] = sum1; + top_blob[p + 2] = sum2; + top_blob[p + 3] = sum3; + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const unsigned short* w = weight_data_tm.row(p); + + const float* m = bottom_blob_flattened; + + int i = 0; + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(w, 0)); + _sum0 = __lsx_vfmadd_s(_w, _m, _sum0); + + m += 4; + w += 4; + } + sum += __lsx_reduce_fadd_s(_sum0); + for (; i < num_input; i++) + { + sum += *m * float16_to_float32(*w); + + m++; + w++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + } + + return 0; +} +#endif // __loongarch_sx + +#if NCNN_INT8 +int InnerProduct_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif // __loongarch_sx + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input && bottom_blob_int8.h * bottom_blob_int8.elempack > 1) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = h % 4 == 0 ? 4 : 1; + } +#endif + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + +#if __loongarch_sx + if (num_output_elempack == 8 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum20 = __lsx_vreplgr2vr_w(0); + __m128i _sum21 = __lsx_vreplgr2vr_w(0); + __m128i _sum30 = __lsx_vreplgr2vr_w(0); + __m128i _sum31 = __lsx_vreplgr2vr_w(0); + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m0 + 4); + __builtin_prefetch(m1 + 4); + __builtin_prefetch(m2 + 4); + __builtin_prefetch(m3 + 4); + __builtin_prefetch(kptr + 32); + __m128i _val0 = __lsx_vreplgr2vr_h((short)m0[0]); + __m128i _val1 = __lsx_vreplgr2vr_h((short)m1[0]); + __m128i _val2 = __lsx_vreplgr2vr_h((short)m2[0]); + __m128i _val3 = __lsx_vreplgr2vr_h((short)m3[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val0, _w16); + __m128i _s1 = __lsx_vmul_h(_val1, _w16); + __m128i _s2 = __lsx_vmul_h(_val2, _w16); + __m128i _s3 = __lsx_vmul_h(_val3, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _exts2 = __lsx_vslti_h(_s2, 0); + __m128i _exts3 = __lsx_vslti_h(_s3, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + __m128i _s2l = __lsx_vilvl_h(_exts2, _s2); + __m128i _s2h = __lsx_vilvh_h(_exts2, _s2); + __m128i _s3l = __lsx_vilvl_h(_exts3, _s3); + __m128i _s3h = __lsx_vilvh_h(_exts3, _s3); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum01 = __lsx_vadd_w(_sum01, _s0h); + _sum10 = __lsx_vadd_w(_sum10, _s1l); + _sum11 = __lsx_vadd_w(_sum11, _s1h); + _sum20 = __lsx_vadd_w(_sum20, _s2l); + _sum21 = __lsx_vadd_w(_sum21, _s2h); + _sum30 = __lsx_vadd_w(_sum30, _s3l); + _sum31 = __lsx_vadd_w(_sum31, _s3h); + + m0++; + m1++; + m2++; + m3++; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_00 = __lsx_vffint_s_w(_sum00); + __m128 _sumfp32_01 = __lsx_vffint_s_w(_sum01); + __m128 _sumfp32_10 = __lsx_vffint_s_w(_sum10); + __m128 _sumfp32_11 = __lsx_vffint_s_w(_sum11); + __m128 _sumfp32_20 = __lsx_vffint_s_w(_sum20); + __m128 _sumfp32_21 = __lsx_vffint_s_w(_sum21); + __m128 _sumfp32_30 = __lsx_vffint_s_w(_sum30); + __m128 _sumfp32_31 = __lsx_vffint_s_w(_sum31); + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_00 = __lsx_vfmadd_s(_scale_in0, _sumfp32_00, _bias0); + _sumfp32_01 = __lsx_vfmadd_s(_scale_in1, _sumfp32_01, _bias1); + _sumfp32_10 = __lsx_vfmadd_s(_scale_in0, _sumfp32_10, _bias0); + _sumfp32_11 = __lsx_vfmadd_s(_scale_in1, _sumfp32_11, _bias1); + _sumfp32_20 = 
__lsx_vfmadd_s(_scale_in0, _sumfp32_20, _bias0); + _sumfp32_21 = __lsx_vfmadd_s(_scale_in1, _sumfp32_21, _bias1); + _sumfp32_30 = __lsx_vfmadd_s(_scale_in0, _sumfp32_30, _bias0); + _sumfp32_31 = __lsx_vfmadd_s(_scale_in1, _sumfp32_31, _bias1); + } + else + { + _sumfp32_00 = __lsx_vfmul_s(_sumfp32_00, _scale_in0); + _sumfp32_01 = __lsx_vfmul_s(_sumfp32_01, _scale_in1); + _sumfp32_10 = __lsx_vfmul_s(_sumfp32_10, _scale_in0); + _sumfp32_11 = __lsx_vfmul_s(_sumfp32_11, _scale_in1); + _sumfp32_20 = __lsx_vfmul_s(_sumfp32_20, _scale_in0); + _sumfp32_21 = __lsx_vfmul_s(_sumfp32_21, _scale_in1); + _sumfp32_30 = __lsx_vfmul_s(_sumfp32_30, _scale_in0); + _sumfp32_31 = __lsx_vfmul_s(_sumfp32_31, _scale_in1); + } + + _sumfp32_00 = activation_ps(_sumfp32_00, activation_type, activation_params); + _sumfp32_01 = activation_ps(_sumfp32_01, activation_type, activation_params); + _sumfp32_10 = activation_ps(_sumfp32_10, activation_type, activation_params); + _sumfp32_11 = activation_ps(_sumfp32_11, activation_type, activation_params); + _sumfp32_20 = activation_ps(_sumfp32_20, activation_type, activation_params); + _sumfp32_21 = activation_ps(_sumfp32_21, activation_type, activation_params); + _sumfp32_30 = activation_ps(_sumfp32_30, activation_type, activation_params); + _sumfp32_31 = activation_ps(_sumfp32_31, activation_type, activation_params); + + // transpose 4x8 + __m128i _r01r = __lsx_vilvl_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00); + __m128i _r01l = __lsx_vilvh_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00); + __m128i _r23r = __lsx_vilvl_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20); + __m128i _r23l = __lsx_vilvh_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20); + __m128i _r45r = __lsx_vilvl_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01); + __m128i _r45l = __lsx_vilvh_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01); + __m128i _r67r = __lsx_vilvl_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21); + __m128i _r67l = __lsx_vilvh_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21); + _sumfp32_00 = (__m128)__lsx_vilvl_d(_r23r, _r01r); + _sumfp32_10 = (__m128)__lsx_vilvh_d(_r23r, _r01r); + _sumfp32_20 = (__m128)__lsx_vilvl_d(_r23l, _r01l); + _sumfp32_30 = (__m128)__lsx_vilvh_d(_r23l, _r01l); + _sumfp32_01 = (__m128)__lsx_vilvl_d(_r67r, _r45r); + _sumfp32_11 = (__m128)__lsx_vilvh_d(_r67r, _r45r); + _sumfp32_21 = (__m128)__lsx_vilvl_d(_r67l, _r45l); + _sumfp32_31 = (__m128)__lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_sumfp32_00, outptr, 0); + __lsx_vst(_sumfp32_10, outptr + 4, 0); + __lsx_vst(_sumfp32_20, outptr + 8, 0); + __lsx_vst(_sumfp32_30, outptr + 12, 0); + __lsx_vst(_sumfp32_01, outptr + 16, 0); + __lsx_vst(_sumfp32_11, outptr + 20, 0); + __lsx_vst(_sumfp32_21, outptr + 24, 0); + __lsx_vst(_sumfp32_31, outptr + 28, 0); + + outptr += 32; + } + } + } + + if (num_output_elempack == 1 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum0 += *m0++ * kptr[0]; + sum1 += *m1++ * kptr[0]; + sum2 += *m2++ * kptr[0]; + sum3 += *m3++ * kptr[0]; + kptr 
+= 1; + } + + // dequantize and relu + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p]; + float sumfp32_2 = sum2 * scale_in_data[p]; + float sumfp32_3 = sum3 * scale_in_data[p]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p]; + sumfp32_2 += bias_data[p]; + sumfp32_3 += bias_data[p]; + } + + outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params); + outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params); + outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params); + outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params); + outptr += 4; + } + } + } + + if (num_output_elempack == 8 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 4); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vreplgr2vr_h((short)m[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + m++; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0); + __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1); + } + else + { + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + __lsx_vst(_sumfp32_0, outptr, 0); + __lsx_vst(_sumfp32_1, outptr + 4, 0); + outptr += 8; + } + } + } +#endif // __loongarch_sx + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum += *m++ * *kptr++; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + 
flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + // size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(sptr + 4); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vreplgr2vr_h((short)sptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + sptr += 1; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0); + __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1); + } + else + { + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + float* outptr = (float*)top_blob + p * 8; + __lsx_vst(_sumfp32_0, outptr, 0); + __lsx_vst(_sumfp32_1, outptr + 4, 0); + } + } +#endif // __loongarch_sx + + if (out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + int sum = 0; + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/innerproduct_loongarch.h b/src/layer/loongarch/innerproduct_loongarch.h new file mode 100644 index 000000000000..4d9574ce9192 --- /dev/null +++ b/src/layer/loongarch/innerproduct_loongarch.h @@ -0,0 +1,54 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_INNERPRODUCT_LOONGARCH_H +#define LAYER_INNERPRODUCT_LOONGARCH_H + +#include "innerproduct.h" + +namespace ncnn { + +class InnerProduct_loongarch : virtual public InnerProduct +{ +public: + InnerProduct_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: +#if __loongarch_sx + int create_pipeline_fp16s(const Option& opt); + int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* flatten; + + Mat weight_data_tm; + +#if NCNN_INT8 + Mat scale_in_data; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_INNERPRODUCT_LOONGARCH_H diff --git a/src/layer/loongarch/interp_bicubic.h b/src/layer/loongarch/interp_bicubic.h new file mode 100644 index 000000000000..e52ba81de4f0 --- /dev/null +++ b/src/layer/loongarch/interp_bicubic.h @@ -0,0 +1,261 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
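+
+// Bicubic interpolation below uses the cubic convolution kernel with A = -0.75
+// (the coefficient commonly used for bicubic resampling, e.g. OpenCV's INTER_CUBIC).
+// interpolate_cubic() evaluates that kernel at the four source taps around the
+// sample point, whose distances to the taps are fx+1, fx, 1-fx and 2-fx; the last
+// weight is taken as 1 minus the other three so the four weights always sum to 1.
+// cubic_coeffs() additionally folds taps that fall outside the image back into
+// the valid weights near the borders.
+//
+// Scalar sketch of how the weights are applied (illustrative only):
+//   dst[dx] = w0*src[sx-1] + w1*src[sx] + w2*src[sx+1] + w3*src[sx+2]
+// e.g. fx = 0 gives {w0,w1,w2,w3} = {0, 1, 0, 0} (dst[dx] = src[sx]), while
+// fx = 0.5 gives {-0.09375, 0.59375, 0.59375, -0.09375}.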
+ +static inline void interpolate_cubic(float fx, float* coeffs) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +static void cubic_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) +{ + double scale = (double)w / outw; + if (align_corner) + { + scale = (double)(w - 1) / (outw - 1); + } + + for (int dx = 0; dx < outw; dx++) + { + float fx = (float)((dx + 0.5) * scale - 0.5); + if (align_corner) + { + fx = (float)(dx * scale); + } + + int sx = static_cast(floor(fx)); + fx -= sx; + + interpolate_cubic(fx, alpha + dx * 4); + + if (sx <= -1) + { + sx = 1; + alpha[dx * 4 + 0] = 1.f - alpha[dx * 4 + 3]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 3]; + alpha[dx * 4 + 2] = 0.f; + alpha[dx * 4 + 3] = 0.f; + } + if (sx == 0) + { + sx = 1; + alpha[dx * 4 + 0] = alpha[dx * 4 + 0] + alpha[dx * 4 + 1]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 2]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 3]; + alpha[dx * 4 + 3] = 0.f; + } + if (sx == w - 2) + { + sx = w - 3; + alpha[dx * 4 + 3] = alpha[dx * 4 + 2] + alpha[dx * 4 + 3]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 1]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 0]; + alpha[dx * 4 + 0] = 0.f; + } + if (sx >= w - 1) + { + sx = w - 3; + alpha[dx * 4 + 3] = 1.f - alpha[dx * 4 + 0]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 0]; + alpha[dx * 4 + 1] = 0.f; + alpha[dx * 4 + 0] = 0.f; + } + + xofs[dx] = sx; + } +} + +static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w); + Mat rowsbuf1(w); + Mat rowsbuf2(w); + Mat rowsbuf3(w); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + float* rows2 = rowsbuf2; + float* rows3 = rowsbuf3; + + int prev_sy1 = -3; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows2; + rows2 = rows3; + rows3 = rows0_old; + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else if (sy == prev_sy1 + 2) + { + // hresize two rows + float* rows0_old = rows0; + float* rows1_old = rows1; + rows0 = rows2; + rows1 = rows3; + rows2 = rows0_old; + rows3 = rows1_old; + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else if (sy == prev_sy1 + 3) + { + // hresize three rows + float* rows0_old = rows0; + float* rows1_old = rows1; + 
float* rows2_old = rows2; + rows0 = rows3; + rows1 = rows0_old; + rows2 = rows1_old; + rows3 = rows2_old; + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else + { + // hresize four rows + const float* S0 = src.row(sy - 1); + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3; + rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + + prev_sy1 = sy; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + float b2 = beta[2]; + float b3 = beta[3]; + + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + float* Dp = dst.row(dy); + for (int dx = 0; dx < w; dx++) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3; + } + + beta += 4; + } +} diff --git a/src/layer/loongarch/interp_bicubic_pack4.h b/src/layer/loongarch/interp_bicubic_pack4.h new file mode 100644 index 000000000000..54281691ad79 --- /dev/null +++ b/src/layer/loongarch/interp_bicubic_pack4.h @@ -0,0 +1,286 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
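+
+// Pack4 variant of the bicubic resize: each logical pixel is a packed group of
+// 4 channels, so a single __m128 load/FMA handles all 4 channels of one tap.
+// xofs[] offsets are therefore multiplied by 4 when indexing a source row, and
+// the rows0..rows3 buffers hold w * 4 floats each. The rolling reuse of already
+// horizontally-filtered rows between consecutive output rows (tracked via
+// prev_sy1) mirrors the scalar resize_bicubic_image().
+//
+// Vertical pass per output pixel, one 4-channel lane at a time (sketch):
+//   D = rows0*b0 + rows1*b1 + rows2*b2 + rows3*b3
+// implemented below as one __lsx_vfmul_s followed by three __lsx_vfmadd_s.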
+ +static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w, (size_t)4 * 4u, 4); + Mat rowsbuf1(w, (size_t)4 * 4u, 4); + Mat rowsbuf2(w, (size_t)4 * 4u, 4); + Mat rowsbuf3(w, (size_t)4 * 4u, 4); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + float* rows2 = rowsbuf2; + float* rows3 = rowsbuf3; + + int prev_sy1 = -3; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows2; + rows2 = rows3; + rows3 = rows0_old; + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else if (sy == prev_sy1 + 2) + { + // hresize two rows + float* rows0_old = rows0; + float* rows1_old = rows1; + rows0 = rows2; + rows1 = rows3; + rows2 = rows0_old; + rows3 = rows1_old; + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else if (sy == prev_sy1 + 3) + { + // hresize three rows + float* rows0_old = rows0; + float* rows1_old = rows1; + float* rows2_old = rows2; + rows0 = rows3; + rows1 = rows0_old; + rows2 = rows1_old; + rows3 = rows2_old; + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S1p 
= S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0); + __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0); + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows1 = __lsx_vfmadd_s(_a3, _S13, _rows1); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else + { + // hresize four rows + const float* S0 = src.row(sy - 1); + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S00 = (__m128)__lsx_vld(S0p - 4, 0); + __m128 _S01 = (__m128)__lsx_vld(S0p + 0, 0); + __m128 _S02 = (__m128)__lsx_vld(S0p + 4, 0); + __m128 _S03 = (__m128)__lsx_vld(S0p + 8, 0); + __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0); + __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0); + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows0 = __lsx_vfmul_s(_S00, _a0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows0 = __lsx_vfmadd_s(_a2, _S02, _rows0); + _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows0 = __lsx_vfmadd_s(_a3, _S03, _rows0); + _rows1 = 
__lsx_vfmadd_s(_a3, _S13, _rows1); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows0, rows0p + dx * 4, 0); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + + prev_sy1 = sy; + + // vresize + __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]); + __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]); + __m128 _b2 = __lsx_vreplfr2vr_s(beta[2]); + __m128 _b3 = __lsx_vreplfr2vr_s(beta[3]); + + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + float* Dp = dst.row(dy); + + for (int dx = 0; dx < w; dx++) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + __m128 _rows2 = (__m128)__lsx_vld(rows2p, 0); + __m128 _rows3 = (__m128)__lsx_vld(rows3p, 0); + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + _D = __lsx_vfmadd_s(_b2, _rows2, _D); + _D = __lsx_vfmadd_s(_b3, _rows3, _D); + __lsx_vst(_D, Dp, 0); + + Dp += 4; + rows0p += 4; + rows1p += 4; + rows2p += 4; + rows3p += 4; + } + + beta += 4; + } +} diff --git a/src/layer/loongarch/interp_bilinear.h b/src/layer/loongarch/interp_bilinear.h new file mode 100644 index 000000000000..ad5a28672bef --- /dev/null +++ b/src/layer/loongarch/interp_bilinear.h @@ -0,0 +1,172 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
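+
+// linear_coeffs() maps every destination column dx back to a source coordinate.
+// Without align_corner the mapping uses pixel centers, fx = (dx + 0.5) * scale - 0.5,
+// otherwise fx = dx * scale; sx is the left source tap, fx its fractional part,
+// and the two weights are alpha = {1 - fx, fx}. Samples falling outside the image
+// are clamped to the first or last valid pair of columns.
+//
+// Worked example (illustrative): upscaling w = 4 to outw = 8 gives scale = 0.5;
+// dx = 1 -> fx = 1.5 * 0.5 - 0.5 = 0.25, so sx = 0 and alpha = {0.75, 0.25},
+// i.e. dst[1] = 0.75 * src[0] + 0.25 * src[1].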
+ +static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) +{ + double scale = (double)w / outw; + if (align_corner) + { + scale = (double)(w - 1) / (outw - 1); + } + + for (int dx = 0; dx < outw; dx++) + { + float fx = (float)((dx + 0.5) * scale - 0.5); + if (align_corner) + { + fx = (float)(dx * scale); + } + + int sx = floor(fx); + fx -= sx; + + if (sx < 0) + { + sx = 0; + fx = 0.f; + } + if (sx >= w - 1) + { + sx = w - 2; + fx = 1.f; + } + + xofs[dx] = sx; + + alpha[dx * 2] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } +} + +static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w); + Mat rowsbuf1(w); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + + int prev_sy1 = -2; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + else + { + // hresize two rows + const float* S0 = src.row(sy); + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + + prev_sy1 = sy; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + + float* rows0p = rows0; + float* rows1p = rows1; + float* Dp = dst.row(dy); + +#if __loongarch_sx + int nn = w >> 3; +#else + int nn = 0; +#endif + int remain = w - (nn << 3); + +#if __loongarch_sx + __m128 _b0 = __lsx_vreplfr2vr_s(b0); + __m128 _b1 = __lsx_vreplfr2vr_s(b1); + for (; nn > 0; nn--) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + + __lsx_vst(_D, Dp, 0); + + __m128 _rows0n = (__m128)__lsx_vld(rows0p + 4, 0); + __m128 _rows1n = (__m128)__lsx_vld(rows1p + 4, 0); + + __m128 _Dn = __lsx_vfmul_s(_rows0n, _b0); + _Dn = __lsx_vfmadd_s(_b1, _rows1n, _Dn); + + __lsx_vst(_Dn, Dp + 4, 0); + + Dp += 8; + rows0p += 8; + rows1p += 8; + } +#endif // __loongarch_sx + for (; remain; --remain) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } + + beta += 2; + } +} diff --git a/src/layer/loongarch/interp_bilinear_pack4.h b/src/layer/loongarch/interp_bilinear_pack4.h new file mode 100644 index 000000000000..2cfb138a1cbd --- /dev/null +++ b/src/layer/loongarch/interp_bilinear_pack4.h @@ -0,0 +1,123 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w, (size_t)4 * 4u, 4); + Mat rowsbuf1(w, (size_t)4 * 4u, 4); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + + int prev_sy1 = -2; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S1p = S1 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S10 = (__m128)__lsx_vld(S1p, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + + alphap += 2; + } + } + else + { + // hresize two rows + const float* S0 = src.row(sy); + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S00 = (__m128)__lsx_vld(S0p, 0); + __m128 _S01 = (__m128)__lsx_vld(S0p + 4, 0); + __m128 _S10 = (__m128)__lsx_vld(S1p, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _rows0 = __lsx_vfmul_s(_S00, _a0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + __lsx_vst(_rows0, rows0p + dx * 4, 0); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + + alphap += 2; + } + } + + prev_sy1 = sy; + + // vresize + __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]); + __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]); + + float* rows0p = rows0; + float* rows1p = rows1; + float* Dp = dst.row(dy); + + for (int dx = 0; dx < w; dx++) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + __lsx_vst(_D, Dp, 0); + + Dp += 4; + rows0p += 4; + rows1p += 4; + } + + beta += 2; + } +} diff --git a/src/layer/loongarch/interp_loongarch.cpp b/src/layer/loongarch/interp_loongarch.cpp new file mode 100644 index 000000000000..94d25cf005eb --- /dev/null +++ b/src/layer/loongarch/interp_loongarch.cpp @@ -0,0 +1,470 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "interp_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +#include "interp_bicubic.h" +#include "interp_bilinear.h" + +#if __loongarch_sx +#include "interp_bicubic_pack4.h" +#include "interp_bilinear_pack4.h" +#endif + +Interp_loongarch::Interp_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Interp_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& reference_blob = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int h = bottom_blob.h; + int w = bottom_blob.w; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = reference_blob.w; + int outh = reference_blob.h; + + if (dims == 1) + { + top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < w; q++) + { + Mat top_blob_c = top_blob.channel(q); + __m128 _v = (__m128)__lsx_vld((const float*)bottom_blob + q * 4, 0); + top_blob_c.fill(_v); + } + + return 0; + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < w; q++) + { + Mat top_blob_c = top_blob.channel(q); + const float v = bottom_blob[q]; + top_blob_c.fill(v); + } + + return 0; + } + + if (dims == 2) + { + if (outw == w) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4) + { + if (resize_type == 1) // nearest + { + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0); + __lsx_vst(_p, outptr, 0); + + outptr += 4; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * 4; + const float* Sp = ptr + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S0 = (__m128)__lsx_vld(Sp, 0); + __m128 _S1 = (__m128)__lsx_vld(Sp + 4, 0); + __m128 _p = __lsx_vfmul_s(_S0, _a0); + _p = __lsx_vfmadd_s(_a1, _S1, _p); + __lsx_vst(_p, outptr, 0); + + alphap += 2; + outptr += 4; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * 4; + const float* Sp = ptr + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S0 = (__m128)__lsx_vld(Sp - 4, 0); + __m128 _S1 = (__m128)__lsx_vld(Sp + 0, 0); + __m128 _S2 = (__m128)__lsx_vld(Sp + 4, 0); + __m128 _S3 = (__m128)__lsx_vld(Sp + 8, 0); + __m128 _p = __lsx_vfmul_s(_S0, _a0); + _p = __lsx_vfmadd_s(_a1, _S1, _p); + _p = __lsx_vfmadd_s(_a2, _S2, _p); + _p = __lsx_vfmadd_s(_a3, _S3, _p); + __lsx_vst(_p, outptr, 0); + + alphap += 4; + outptr += 4; + } + } + + delete[] buf; + } + + return 0; + } +#endif // __loongarch_sx + + if (resize_type == 1) // nearest + { + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const float* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + *outptr++ = Sp[0] * a0 + Sp[1] * a1; + alphap += 2; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const float* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3; + alphap += 4; + } + } + + delete[] buf; + } + + return 0; + } + + if (outw == w && outh == h) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4) + { + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const float* ptr = src.row(in_y); + float* outptr = dst.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0); + __lsx_vst(_p, outptr, 0); + + outptr += 4; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; + } +#endif // __loongarch_sx + + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const float* ptr = src.row(in_y); + float* outptr = dst.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/interp_loongarch.h b/src/layer/loongarch/interp_loongarch.h new file mode 100644 index 000000000000..4c0e0f3dc86b --- /dev/null +++ b/src/layer/loongarch/interp_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_INTERP_LOONGARCH_H +#define LAYER_INTERP_LOONGARCH_H + +#include "interp.h" + +namespace ncnn { + +class Interp_loongarch : virtual public Interp +{ +public: + Interp_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_INTERP_LOONGARCH_H diff --git a/src/layer/loongarch/loongarch_activation.h b/src/layer/loongarch/loongarch_activation.h new file mode 100644 index 000000000000..abb268f4bb6d --- /dev/null +++ b/src/layer/loongarch/loongarch_activation.h @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. 
+// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LOONGARCH_ACTIVATION_H +#define LOONGARCH_ACTIVATION_H + +#include "fused_activation.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" + +static inline __m128 activation_ps(__m128 _v, int activation_type, const ncnn::Mat& activation_params) +{ + if (activation_type == 1) + { + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + _v = __lsx_vfmax_s(_v, _zero); + } + else if (activation_type == 2) + { + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(activation_params[0]); + __m128i _lemask = __lsx_vfcmp_cle_s(_v, _zero); + __m128 _ps = __lsx_vfmul_s(_v, _slope); + _v = (__m128)__lsx_vbitsel_v((__m128i)_v, (__m128i)_ps, (__m128i)_lemask); + } + else if (activation_type == 3) + { + __m128 _min = (__m128)__lsx_vreplfr2vr_s(activation_params[0]); + __m128 _max = (__m128)__lsx_vreplfr2vr_s(activation_params[1]); + _v = __lsx_vfmax_s(_v, _min); + _v = __lsx_vfmin_s(_v, _max); + } + else if (activation_type == 4) + { + _v = sigmoid_ps(_v); + } + else if (activation_type == 5) + { + _v = __lsx_vfmul_s(_v, tanh_ps(log_ps(__lsx_vfadd_s(exp_ps(_v), (__m128)__lsx_vreplfr2vr_s(1.f))))); + } + else if (activation_type == 6) + { + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(activation_params[0]); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(activation_params[1]); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _outp = __lsx_vfmadd_s(_alpha, _v, _beta); + _outp = __lsx_vfmax_s(_outp, _zero); + _outp = __lsx_vfmin_s(_outp, _one); + _v = __lsx_vfmul_s(_outp, _v); + } + + return _v; +} +#endif // __loongarch_sx + +#endif // LOONGARCH_ACTIVATION_H diff --git a/src/layer/loongarch/loongarch_usability.h b/src/layer/loongarch/loongarch_usability.h new file mode 100644 index 000000000000..d3ae5dec279d --- /dev/null +++ b/src/layer/loongarch/loongarch_usability.h @@ -0,0 +1,236 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
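+
+// Shared helpers for the LoongArch LSX layer implementations. The float2int8*
+// family quantizes floats to signed 8-bit with saturation to [-127, 127]; the
+// lower bound is -127 rather than -128, which keeps the quantized range symmetric.
+// The vector variants emulate round-half-away-from-zero (the behaviour of round())
+// without a dedicated rounding instruction: OR the sign of the input into 0.5,
+// add it, truncate toward zero with __lsx_vftintrz_w_s, then narrow with saturation
+// through __lsx_vsat_w/__lsx_vsat_h and __lsx_vpickev_h/__lsx_vpickev_b.
+//
+// Scalar sketch of the rounding trick (illustrative):
+//   v = -2.5 -> v + copysignf(0.5f, v) = -3.0 -> truncate -> -3
+//   v =  2.3 -> v + 0.5f               =  2.8 -> truncate ->  2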
+ +#ifndef LOONGARCH_USABILITY_H +#define LOONGARCH_USABILITY_H + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include +#include + +namespace ncnn { + +typedef union +{ + int32_t i; + float f; +} FloatInt; + +} // namespace ncnn + +#if __loongarch_sx +/* declare some loongarch constants with union */ +#define _LOONGARCH_FLOAT_CONST(Name, Val) \ + static const ncnn::FloatInt Name = {.f = Val} + +/* float type data load instructions */ +static NCNN_FORCEINLINE __m128 __lsx_vreplfr2vr_s(float val) +{ + ncnn::FloatInt fi_tmpval = {.f = val}; + return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); +} + +static NCNN_FORCEINLINE float __lsx_reduce_fadd_s(__m128 _v) +{ + // TODO find a more efficient way + float* _v_p = (float*)&_v; + return _v_p[0] + _v_p[1] + _v_p[2] + _v_p[3]; +} + +static NCNN_FORCEINLINE int __lsx_reduce_add_w(__m128i _v) +{ + // TODO find a more efficient way + int* _v_p = (int*)&_v; + return _v_p[0] + _v_p[1] + _v_p[2] + _v_p[3]; +} + +#endif // __loongarch_sx + +static NCNN_FORCEINLINE signed char float2int8(float v) +{ + int int32 = round(v); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} + +#if __loongarch_sx +static NCNN_FORCEINLINE __m128i float2int8(__m128 _v) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask); + __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, (__m128i)_sign); + __m128 _v5 = __lsx_vfadd_s(_v, _p5s); + __m128i _v32 = __lsx_vftintrz_w_s(_v5); + + __m128i _v32_16 = __lsx_vsat_w(_v32, 15); + __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16); + _v16 = __lsx_vmax_h(_v16, __lsx_vreplgr2vr_h(-127)); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8; +} + +static NCNN_FORCEINLINE int64_t float2int8(__m128 _vlow, __m128 _vhigh) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v((__m128i)_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v((__m128i)_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, _vlow32_16); + _v16 = __lsx_vmax_h(_v16, __lsx_vreplgr2vr_h(-127)); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} + +static NCNN_FORCEINLINE __m128i float2int8relu(__m128 _v) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask); + __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, _sign); + __m128 _v5 = __lsx_vfadd_s(_v, _p5s); + __m128i _v32 = __lsx_vftintrz_w_s(_v5); + + __m128i _v32_16 = __lsx_vsat_w(_v32, 15); + __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16); + _v16 = __lsx_vmaxi_h(_v16, 0); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8; +} + +static NCNN_FORCEINLINE 
int64_t float2int8relu(__m128 _vlow, __m128 _vhigh) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v((__m128i)_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v((__m128i)_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, _vlow32_16); + _v16 = __lsx_vmaxi_h(_v16, 0); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} + +static NCNN_FORCEINLINE __m128i float2int8leakyrelu(__m128 _v, __m128 _slope) +{ + __m128 _v_leaky = __lsx_vfmul_s(_v, _slope); + + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask); + __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, _sign); + __m128 _v5 = __lsx_vfadd_s(_v, _p5s); + __m128i _v32 = __lsx_vftintrz_w_s(_v5); + + __m128i _sign_leaky = __lsx_vand_v((__m128i)_v_leaky, _signmask); + __m128 _p5_leaky = (__m128)__lsx_vor_v((__m128i)_p5, _sign_leaky); + __m128 _v5_leaky = __lsx_vfadd_s(_v_leaky, _p5_leaky); + __m128i _v32_leaky = __lsx_vftintrz_w_s(_v5_leaky); + + __m128i _v32_16 = __lsx_vsat_w(_v32, 15); + __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16); + + __m128i _v32_16_leaky = __lsx_vsat_w(_v32_leaky, 15); + __m128i _v16_leaky = __lsx_vpickev_h(_v32_16_leaky, _v32_16_leaky); + + _v16 = __lsx_vmax_h(_v16, _v16_leaky); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8; +} + +static NCNN_FORCEINLINE int64_t float2int8leakyrelu(__m128 _vlow, __m128 _vhigh, __m128 _slope) +{ + __m128 _vlow_leaky = __lsx_vfmul_s(_vlow, _slope); + __m128 _vhigh_leaky = __lsx_vfmul_s(_vhigh, _slope); + + // simulate round to nearest via +/-0.5 + __m128i _p5 = (__m128i)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v(_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v(_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _signlow_leaky = __lsx_vand_v((__m128i)_vlow_leaky, _signmask); + __m128i _signhigh_leaky = __lsx_vand_v((__m128i)_vhigh_leaky, _signmask); + __m128 _p5low_leaky = (__m128)__lsx_vor_v(_p5, _signlow_leaky); + __m128 _p5high_leaky = (__m128)__lsx_vor_v(_p5, _signhigh_leaky); + __m128 _vlow5_leaky = __lsx_vfadd_s(_vlow_leaky, _p5low_leaky); + __m128 _vhigh5_leaky = __lsx_vfadd_s(_vhigh_leaky, _p5high_leaky); + __m128i _vlow32_leaky = __lsx_vftintrz_w_s(_vlow5_leaky); + __m128i _vhigh32_leaky = __lsx_vftintrz_w_s(_vhigh5_leaky); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, 
_vlow32_16); + + __m128i _vlow32_16_leaky = __lsx_vsat_w(_vlow32_leaky, 15); + __m128i _vhigh32_16_leaky = __lsx_vsat_w(_vhigh32_leaky, 15); + __m128i _v16_leaky = __lsx_vpickev_h(_vhigh32_16_leaky, _vlow32_16_leaky); + + _v16 = __lsx_vmax_h(_v16, _v16_leaky); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} +#endif // __loongarch_sx + +#endif // LOONGARCH_USABILITY_H diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h new file mode 100644 index 000000000000..ededa5966593 --- /dev/null +++ b/src/layer/loongarch/lsx_mathfun.h @@ -0,0 +1,258 @@ +/* LOONGARCH implementation of exp + * + * Inspired by Intel Approximate Math library, and based on the + * corresponding algorithms of the cephes math library + * Copyright (C) 2022 yala ;. All rights reserved. + */ + +/* + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * (this is the zlib license) + */ + +#ifndef LSX_MATHFUN_H +#define LSX_MATHFUN_H + +#include "loongarch_usability.h" + +#include + +_LOONGARCH_FLOAT_CONST(c_1, 1.0f); +_LOONGARCH_FLOAT_CONST(c_2, 2.0f); +_LOONGARCH_FLOAT_CONST(c_n1, -1.0f); +_LOONGARCH_FLOAT_CONST(c_0p5, 0.5f); + +#define c_inv_mant_mask ~0x7f800000u +_LOONGARCH_FLOAT_CONST(c_cephes_SQRTHF, 0.707106781186547524); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p0, 7.0376836292E-2); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p1, -1.1514610310E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p2, 1.1676998740E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p3, -1.2420140846E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p4, +1.4249322787E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p5, -1.6668057665E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p6, +2.0000714765E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p7, -2.4999993993E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p8, +3.3333331174E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_q1, -2.12194440e-4); +_LOONGARCH_FLOAT_CONST(c_cephes_log_q2, 0.693359375); + +/* natural logarithm computed for 4 simultaneous float + * return NaN for x <= 0 + */ +static inline __m128 log_ps(__m128 x) +{ + __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + + x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(0)); /* force flush to zero on denormal values */ + __m128i invalid_mask = __lsx_vfcmp_cle_s(x, (__m128)__lsx_vreplgr2vr_w(0)); + + __m128i ux = (__m128i)(x); + + __m128i emm0 = __lsx_vsrl_w(ux, __lsx_vreplgr2vr_w(23)); + + /* keep only the fractional part */ + ux = __lsx_vand_v(ux, __lsx_vreplgr2vr_w(c_inv_mant_mask)); + ux = __lsx_vor_v(ux, __lsx_vreplgr2vr_w(c_0p5.i)); + x = (__m128)(ux); + + emm0 = __lsx_vsub_w(emm0, __lsx_vreplgr2vr_w(0x7f)); + __m128 e = __lsx_vffint_s_w(emm0); + + e = __lsx_vfadd_s(e, one); + + /* part2: + * if( x < 
SQRTHF ) { + * e -= 1; + * x = x + x - 1.0; + * } else { x = x - 1.0; } + */ + __m128i mask = __lsx_vfcmp_clt_s((__m128)x, (__m128)__lsx_vreplgr2vr_w(c_cephes_SQRTHF.i)); + __m128 tmp = (__m128)(__lsx_vand_v((__m128i)(x), (__m128i)mask)); + x = __lsx_vfsub_s(x, one); + e = __lsx_vfsub_s(e, (__m128)(__lsx_vand_v((__m128i)(one), (__m128i)mask))); + x = __lsx_vfadd_s(x, tmp); + + __m128 z = __lsx_vfmul_s(x, x); + + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p0.i); + + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p1.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p2.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p3.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p4.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p5.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p6.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p7.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p8.i)); + y = __lsx_vfmul_s(y, x); + + y = __lsx_vfmul_s(y, z); + + tmp = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q1.i)); + y = __lsx_vfadd_s(y, tmp); + + tmp = __lsx_vfmul_s(z, (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + y = __lsx_vfsub_s(y, tmp); + + tmp = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q2.i)); + x = __lsx_vfadd_s(x, y); + x = __lsx_vfadd_s(x, tmp); + x = (__m128)(__lsx_vor_v((__m128i)(x), (__m128i)invalid_mask)); // negative arg will be NAN + return x; +} + +_LOONGARCH_FLOAT_CONST(c_exp_hi, 88.3762626647949f); +_LOONGARCH_FLOAT_CONST(c_exp_lo, -88.3762626647949f); + +_LOONGARCH_FLOAT_CONST(c_cephes_LOG2EF, 1.44269504088896341); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_C1, 0.693359375); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_C2, -2.12194440e-4); + +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p0, 1.9875691500E-4); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p1, 1.3981999507E-3); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p2, 8.3334519073E-3); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p3, 4.1665795894E-2); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p4, 1.6666665459E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p5, 5.0000001201E-1); + +/* exp() computed for 4 float at once */ +static inline __m128 exp_ps(__m128 x) +{ + __m128 tmp, fx; + + __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + x = __lsx_vfmin_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_hi.i)); + x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_lo.i)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = __lsx_vfmul_s(x, (__m128)__lsx_vreplgr2vr_w(c_cephes_LOG2EF.i)); + fx = __lsx_vfadd_s(fx, (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + + /* perform a floorf */ + tmp = __lsx_vffint_s_w(__lsx_vftint_w_s(fx)); + + /* if greater, substract 1 */ + __m128i mask = __lsx_vfcmp_clt_s(fx, tmp); + mask = __lsx_vand_v(mask, (__m128i)one); + + fx = __lsx_vfsub_s(tmp, (__m128)mask); + + tmp = __lsx_vfmul_s(fx, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C1.i)); + __m128 z = __lsx_vfmul_s(fx, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C2.i)); + x = __lsx_vfsub_s(x, tmp); + x = __lsx_vfsub_s(x, z); + + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p0.i); + + z = __lsx_vfmul_s(x, x); + + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p1.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p2.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p3.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p4.i)); + y = __lsx_vfmadd_s(x, y, 
(__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p5.i)); + + y = __lsx_vfmul_s(y, z); + y = __lsx_vfadd_s(y, x); + y = __lsx_vfadd_s(y, one); + + /* build 2^n */ + __m128i mm; + mm = __lsx_vftintrz_w_s(fx); + mm = __lsx_vadd_w(mm, __lsx_vreplgr2vr_w(0x7f)); + mm = __lsx_vsll_w(mm, __lsx_vreplgr2vr_w(23)); + + y = __lsx_vfmul_s(y, (__m128)mm); + return y; +} + +_LOONGARCH_FLOAT_CONST(c_tanh_tiny, 1e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_hi, 9.0f); +// The monomial coefficients of the numerator polynomial (odd). +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_1, 4.89352455891786e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_3, 6.37261928875436e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_5, 1.48572235717979e-5f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_7, 5.12229709037114e-8f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_9, -8.60467152213735e-11f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_11, 2.00018790482477e-13f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_13, -2.76076847742355e-16f); +// The monomial coefficients of the denominator polynomial (even). +_LOONGARCH_FLOAT_CONST(c_tanh_beta_0, 4.89352518554385e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_2, 2.26843463243900e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_4, 1.18534705686654e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_6, 1.19825839466702e-6f); + +/* tanh() computed for 4 float at once */ +static inline __m128 tanh_ps(__m128 x) +{ + __m128 x2 = (__m128)__lsx_vbitclri_w((__m128i)x, 31); + __m128i tiny_mask = __lsx_vfcmp_clt_s((__m128)x2, (__m128)(__m128)__lsx_vreplgr2vr_w(c_tanh_tiny.i)); + __m128i sig_mask = __lsx_vreplgr2vr_w(1 << 31); + __m128i sig_save = __lsx_vand_v((__m128i)x, sig_mask); + + // clamp the inputs to the range [-9, 9] since anything outside + // this range is -/+1.0f in single-precision. + x2 = (__m128)__lsx_vbitsel_v((__m128i)x2, (__m128i)__lsx_vreplgr2vr_w(c_tanh_hi.i), (__m128i)__lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(c_tanh_hi.i), (__m128)x2)); + + // since the polynomials are odd/even, we need x**2. + __m128 z = __lsx_vfmul_s(x2, x2); + + // evaluate the numerator polynomial y. + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_13.i); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_11.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_9.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_7.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_5.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_3.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_1.i)); + y = __lsx_vfmul_s(y, x2); + + // evaluate the denominator polynomial w. + __m128 w = (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_6.i); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_4.i)); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_2.i)); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_0.i)); + + // divide the numerator by the denominator. + y = __lsx_vfdiv_s(y, w); + + // reinstate the sign. + y = (__m128)__lsx_vor_v((__m128i)y, sig_save); + + // when the argument is very small in magnitude it's more accurate to just return it. 
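+    // __lsx_vbitsel_v(a, b, mask) keeps the bits of a where mask is clear and takes the bits of b
+    // where mask is set, so lanes flagged by tiny_mask (|x| < c_tanh_tiny) return the input x unchanged.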
+ y = (__m128)__lsx_vbitsel_v((__m128i)y, (__m128i)x, (__m128i)tiny_mask); + + return y; +} + +static inline __m128 pow_ps(__m128 a, __m128 b) +{ + // pow(x, m) = exp(m * log(x)) + return exp_ps(__lsx_vfmul_s(b, log_ps(a))); +} + +static inline __m128 sigmoid_ps(__m128 _v) +{ + __m128 _one = __lsx_vreplfr2vr_s(1.f); + _v = (__m128)__lsx_vbitrevi_w((__m128i)_v, 31); + _v = exp_ps(_v); + _v = __lsx_vfadd_s(_v, _one); + return __lsx_vfdiv_s(_one, _v); +} + +#endif // LSX_MATHFUN_H diff --git a/src/layer/loongarch/mish_loongarch.cpp b/src/layer/loongarch/mish_loongarch.cpp new file mode 100644 index 000000000000..8558e2f8cb06 --- /dev/null +++ b/src/layer/loongarch/mish_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "mish_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include + +namespace ncnn { + +Mish_loongarch::Mish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Mish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmul_s(_p, tanh_ps(log_ps(__lsx_vfadd_s(exp_ps(_p), _one)))); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr * tanh(log(exp(*ptr) + 1.f)); + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/mish_loongarch.h b/src/layer/loongarch/mish_loongarch.h new file mode 100644 index 000000000000..97c6f0520f50 --- /dev/null +++ b/src/layer/loongarch/mish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
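+// Mish_loongarch overrides Mish::forward_inplace with an LSX path that evaluates
+// mish(x) = x * tanh(ln(1 + exp(x))) four floats at a time and falls back to scalar math for the tail.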
+ +#ifndef LAYER_MISH_LOONGARCH_H +#define LAYER_MISH_LOONGARCH_H + +#include "mish.h" + +namespace ncnn { + +class Mish_loongarch : virtual public Mish +{ +public: + Mish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_MISH_LOONGARCH_H diff --git a/src/layer/loongarch/packing_loongarch.cpp b/src/layer/loongarch/packing_loongarch.cpp new file mode 100644 index 000000000000..cf68b7b34d69 --- /dev/null +++ b/src/layer/loongarch/packing_loongarch.cpp @@ -0,0 +1,569 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "packing_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Packing_loongarch::Packing_loongarch() +{ + support_packing = true; +} + +int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + if (use_padding) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + if (elembits != 32) + { + // non-fp32 type + return Packing::forward(bottom_blob, top_blob, opt); + } + + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if (elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + + bool pack1to4 = elempack == 1 && out_elempack == 4; + bool pack4to1 = elempack == 4 && out_elempack == 1; + + if (!pack1to4 && !pack4to1) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + if (dims == 1) + { + top_blob = bottom_blob; + top_blob.w = w * elempack / out_elempack; + top_blob.cstep = w * elempack / out_elempack; + top_blob.elemsize = elemsize / elempack * out_elempack; + top_blob.elempack = out_elempack; + return 0; + } + + if (dims == 2) + { + int outh = h * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* r0 = bottom_blob.row(i * 4); + const float* r1 = bottom_blob.row(i * 4 + 1); + const float* r2 = bottom_blob.row(i * 4 + 2); + const float* r3 = bottom_blob.row(i * 4 + 3); + + float* 
outptr = top_blob.row(i); + + int j = 0; +#if __loongarch_sx + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r3 = __lsx_vld(r3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr, 0); + __lsx_vst(_r0123_1, outptr + 4, 0); + __lsx_vst(_r0123_2, outptr + 4 * 2, 0); + __lsx_vst(_r0123_3, outptr + 4 * 3, 0); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outptr += 16; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + + outptr += 4; + } + } + } + if (pack4to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* r0 = bottom_blob.row(i); + + float* outptr0 = top_blob.row(i * 4); + float* outptr1 = top_blob.row(i * 4 + 1); + float* outptr2 = top_blob.row(i * 4 + 2); + float* outptr3 = top_blob.row(i * 4 + 3); + + int j = 0; +#if __loongarch_sx + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + r0 += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + } + } + + return 0; + } + + if (dims == 3 || dims == 4) + { + int size = w * h * d; + int outc = channels * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (dims == 3) + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + else // if (dims == 4) + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* r0 = bottom_blob.channel(q * 4); + const float* r1 = bottom_blob.channel(q * 4 + 1); + const float* r2 = bottom_blob.channel(q * 4 + 2); + const float* r3 = bottom_blob.channel(q * 4 + 3); + + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r3 = __lsx_vld(r3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = 
__lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr, 0); + __lsx_vst(_r0123_1, outptr + 4, 0); + __lsx_vst(_r0123_2, outptr + 4 * 2, 0); + __lsx_vst(_r0123_3, outptr + 4 * 3, 0); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outptr += 16; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + + outptr += 4; + } + } + } + if (pack4to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* r0 = bottom_blob.channel(q); + + float* outptr0 = top_blob.channel(q * 4); + float* outptr1 = top_blob.channel(q * 4 + 1); + float* outptr2 = top_blob.channel(q * 4 + 2); + float* outptr3 = top_blob.channel(q * 4 + 3); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + r0 += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + } + } + + return 0; + } + + return 0; +} + +int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (use_padding) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if (elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + + bool pack1to8 = elempack == 1 && out_elempack == 8; + bool pack8to1 = elempack == 8 && out_elempack == 1; + + if (!pack1to8 && !pack8to1) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + if (dims == 1) + { + top_blob = bottom_blob; + top_blob.w = w * elempack / out_elempack; + top_blob.cstep = w * elempack / out_elempack; + top_blob.elemsize = elemsize / elempack * out_elempack; + top_blob.elempack = out_elempack; + return 0; + } + + if (dims == 2) + { + int outh = h * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if 
(pack1to8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const signed char* r0 = bottom_blob.row(i * 8); + const signed char* r1 = bottom_blob.row(i * 8 + 1); + const signed char* r2 = bottom_blob.row(i * 8 + 2); + const signed char* r3 = bottom_blob.row(i * 8 + 3); + const signed char* r4 = bottom_blob.row(i * 8 + 4); + const signed char* r5 = bottom_blob.row(i * 8 + 5); + const signed char* r6 = bottom_blob.row(i * 8 + 6); + const signed char* r7 = bottom_blob.row(i * 8 + 7); + + signed char* outptr = top_blob.row(i); + + int j = 0; + for (; j < w; j++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + outptr[4] = *r4++; + outptr[5] = *r5++; + outptr[6] = *r6++; + outptr[7] = *r7++; + + outptr += 8; + } + } + } + if (pack8to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const signed char* r0 = bottom_blob.row(i); + + signed char* outptr0 = top_blob.row(i * 8); + signed char* outptr1 = top_blob.row(i * 8 + 1); + signed char* outptr2 = top_blob.row(i * 8 + 2); + signed char* outptr3 = top_blob.row(i * 8 + 3); + signed char* outptr4 = top_blob.row(i * 8 + 4); + signed char* outptr5 = top_blob.row(i * 8 + 5); + signed char* outptr6 = top_blob.row(i * 8 + 6); + signed char* outptr7 = top_blob.row(i * 8 + 7); + + int j = 0; + for (; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; + } + } + } + + return 0; + } + + if (dims == 3 || dims == 4) + { + int size = w * h * d; + int outc = channels * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (dims == 3) + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + else // if (dims == 4) + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const signed char* r0 = bottom_blob.channel(q * 8); + const signed char* r1 = bottom_blob.channel(q * 8 + 1); + const signed char* r2 = bottom_blob.channel(q * 8 + 2); + const signed char* r3 = bottom_blob.channel(q * 8 + 3); + const signed char* r4 = bottom_blob.channel(q * 8 + 4); + const signed char* r5 = bottom_blob.channel(q * 8 + 5); + const signed char* r6 = bottom_blob.channel(q * 8 + 6); + const signed char* r7 = bottom_blob.channel(q * 8 + 7); + + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + outptr[4] = *r4++; + outptr[5] = *r5++; + outptr[6] = *r6++; + outptr[7] = *r7++; + + outptr += 8; + } + } + } + if (pack8to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* r0 = bottom_blob.channel(q); + + signed char* outptr0 = top_blob.channel(q * 8); + signed char* outptr1 = top_blob.channel(q * 8 + 1); + signed char* outptr2 = top_blob.channel(q * 8 + 2); + signed char* outptr3 = top_blob.channel(q * 8 + 3); + signed char* outptr4 = top_blob.channel(q * 8 + 4); + signed char* outptr5 = top_blob.channel(q * 8 + 5); + signed char* outptr6 = top_blob.channel(q * 8 + 6); + signed char* outptr7 = top_blob.channel(q * 8 + 7); + + int i = 0; + for (; i < size; i++) + { + 
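+                    // unpack one elempack-8 int8 group into its 8 destination channels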
*outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; + } + } + } + + return 0; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/packing_loongarch.h b/src/layer/loongarch/packing_loongarch.h new file mode 100644 index 000000000000..1db215cfee7a --- /dev/null +++ b/src/layer/loongarch/packing_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PACKING_LOONGARCH_H +#define LAYER_PACKING_LOONGARCH_H + +#include "packing.h" + +namespace ncnn { + +class Packing_loongarch : virtual public Packing +{ +public: + Packing_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PACKING_LOONGARCH_H diff --git a/src/layer/loongarch/padding_loongarch.cpp b/src/layer/loongarch/padding_loongarch.cpp new file mode 100644 index 000000000000..1f345ce60532 --- /dev/null +++ b/src/layer/loongarch/padding_loongarch.cpp @@ -0,0 +1,385 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
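+// Padding_loongarch keeps packed blobs packed while padding: elempack-4 float and elempack-8 int8
+// data go through the constant/replicate/reflect pack4/pack8 helpers below, and any case the packed
+// paths cannot handle is unpacked with convert_packing and forwarded to the generic Padding layer.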
+ +#include "padding_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "padding_pack4.h" +#include "padding_pack8_int8.h" +#endif // __loongarch_sx + +Padding_loongarch::Padding_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Padding_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0) + { + top_blob = bottom_blob; + return 0; + } + + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + if (dims == 1) + { + int outw = w * elempack + left + right; + + int out_elempack = outw % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (left % 4 == 0 && out_elempack == 4 && type == 0) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + __m128 pad_value = __lsx_vreplfr2vr_s(value); + padding_constant_pack4_lsx(bottom_blob, top_blob, 0, 0, left / 4, right / 4, pad_value); + + return 0; + } + } + + if (dims == 2) + { + int outw = w + left + right; + int outh = h * elempack + top + bottom; + + int out_elempack = outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (top % 4 == 0 && out_elempack == 4 && type == 0) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + __m128 pad_value = __lsx_vreplfr2vr_s(value); + padding_constant_pack4_lsx(bottom_blob, top_blob, top / 4, bottom / 4, left, right, pad_value); + + return 0; + } + } + + if (dims == 3) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outc = channels * elempack + front + behind; + + int out_elempack = outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (front % 4 == 0 && out_elempack == 4 && !(outc != channels * elempack && type != 0)) + { + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int front_ = front / elempack; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc / out_elempack; q++) + { + Mat borderm = top_blob.channel(q); + + __m128 pad_value = per_channel_pad_data_size ? 
(__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value); + //Channel padding + if ((q - front_) < 0 || (q - front_) >= channels) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q - front_); + if (type == 0) + padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value); + if (type == 1) + padding_replicate_pack4_lsx(m, borderm, top, bottom, left, right); + if (type == 2) + padding_reflect_pack4_lsx(m, borderm, top, bottom, left, right); + } + } + + return 0; + } + } + + if (dims == 4) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outd = d + front + behind; + + if (type == 0) + { + top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __m128 pad_value = per_channel_pad_data_size ? (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value); + + for (int z = 0; z < outd; z++) + { + Mat borderm = top_blob.channel(q).depth(z); + + // depth padding + if ((z - front) < 0 || (z - front) >= d) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q).depth(z - front); + padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value); + } + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat top_blob_unpacked; + int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt); + if (ret != 0) + return ret; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = top_blob_unpacked.c % 4 == 0 ? 4 : 1; + } +#endif + + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + + return 0; +} + +int Padding_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int outw = w * elempack + left + right; + + int out_elempack = outw % 8 == 0 ? 8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (left % 8 == 0 && out_elempack == 8 && type == 0) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + padding_constant_pack8_int8_lsx(bottom_blob, top_blob, 0, 0, left / 8, right / 8, pad_value); + + return 0; + } + } + + if (dims == 2) + { + int outw = w + left + right; + int outh = h * elempack + top + bottom; + + int out_elempack = outh % 8 == 0 ? 
8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (top % 8 == 0 && out_elempack == 8 && type == 0) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + padding_constant_pack8_int8_lsx(bottom_blob, top_blob, top / 8, bottom / 8, left, right, pad_value); + + return 0; + } + } + + if (dims == 3) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outc = channels * elempack + front + behind; + + int out_elempack = outc % 8 == 0 ? 8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0)) + { + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int front_ = front / elempack; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc / out_elempack; q++) + { + Mat borderm = top_blob.channel(q); + + // TODO perchannel + // int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value); + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + + //Channel padding + if ((q - front_) < 0 || (q - front_) >= channels) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q - front_); + if (type == 0) + padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value); + if (type == 1) + padding_replicate_pack8_int8_lsx(m, borderm, top, bottom, left, right); + if (type == 2) + padding_reflect_pack8_int8_lsx(m, borderm, top, bottom, left, right); + } + } + + return 0; + } + } + + if (dims == 4) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outd = d + front + behind; + + if (type == 0) + { + top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + // TODO perchannel + // int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value); + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + + for (int z = 0; z < outd; z++) + { + Mat borderm = top_blob.channel(q).depth(z); + + // depth padding + if ((z - front) < 0 || (z - front) >= d) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q).depth(z - front); + padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value); + } + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat top_blob_unpacked; + int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt); + if (ret != 0) + return ret; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = top_blob_unpacked.c % 8 == 0 ? 
8 : 1; + } +#endif + + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/padding_loongarch.h b/src/layer/loongarch/padding_loongarch.h new file mode 100644 index 000000000000..137fbc4459ec --- /dev/null +++ b/src/layer/loongarch/padding_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PADDING_LOONGARCH_H +#define LAYER_PADDING_LOONGARCH_H + +#include "padding.h" + +namespace ncnn { + +class Padding_loongarch : virtual public Padding +{ +public: + Padding_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PADDING_LOONGARCH_H diff --git a/src/layer/loongarch/padding_pack4.h b/src/layer/loongarch/padding_pack4.h new file mode 100644 index 000000000000..d040ce778b58 --- /dev/null +++ b/src/layer/loongarch/padding_pack4.h @@ -0,0 +1,213 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
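+// These helpers treat every element as a group of 4 floats (elempack = 4), so all pointer
+// arithmetic below advances in steps of 4 and each __lsx_vst writes one whole group.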
+ +static void padding_constant_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m128 v) +{ + const float* ptr = src; + float* outptr = dst; + int top_size = top * dst.w; + int bottom_size = bottom * dst.w; + + // fill top + for (int y = 0; y < top_size; y++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __builtin_prefetch(ptr + 32); + __lsx_vst(__lsx_vld(ptr, 0), outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + } + // fill top + for (int y = 0; y < bottom_size; y++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } +} + +static void padding_replicate_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const float* ptr = src; + float* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + const float* ptr0 = ptr; + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill bottom + ptr -= src.w * 4; + for (int y = 0; y < bottom; y++) + { + const float* ptr0 = ptr; + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } +} + +static void padding_reflect_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const float* ptr = src; + float* outptr = dst; + + // fill top + ptr += top * src.w * 4; + for (int y = 0; y < top; y++) + { + const float* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + ptr -= src.w * 4; + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill bottom + ptr -= 2 * src.w * 4; + for (int y = 0; y < bottom; y++) + { + const float* ptr0 = 
ptr; + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + ptr -= src.w * 4; + } +} diff --git a/src/layer/loongarch/padding_pack8_int8.h b/src/layer/loongarch/padding_pack8_int8.h new file mode 100644 index 000000000000..4c6586c6ae27 --- /dev/null +++ b/src/layer/loongarch/padding_pack8_int8.h @@ -0,0 +1,171 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void padding_constant_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int64_t _v) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + for (int x = 0; x < dst.w; x++) + { + *outptr++ = _v; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = _v; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = _v; + } + } + // fill bottom + for (int y = 0; y < bottom; y++) + { + for (int x = 0; x < dst.w; x++) + { + *outptr++ = _v; + } + } +} + +static void padding_replicate_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr0; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-1]; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr[-1]; + } + } + // fill bottom + ptr -= src.w; + for (int y = 0; y < bottom; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr0; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-1]; + } + } +} + +static void padding_reflect_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + ptr += top * src.w; + for (int y = 0; y < top; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = ptr0[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-2 - x]; + } + ptr -= src.w; + } + // fill 
center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = ptr[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr[-2 - x]; + } + } + // fill bottom + ptr -= 2 * src.w; + for (int y = 0; y < bottom; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = ptr0[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-2 - x]; + } + ptr -= src.w; + } +} diff --git a/src/layer/loongarch/pooling_loongarch.cpp b/src/layer/loongarch/pooling_loongarch.cpp new file mode 100644 index 000000000000..9d9889713244 --- /dev/null +++ b/src/layer/loongarch/pooling_loongarch.cpp @@ -0,0 +1,291 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pooling_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Pooling_loongarch::Pooling_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Pooling_loongarch::create_pipeline(const Option& /*opt*/) +{ + if (adaptive_pooling) + { + support_packing = false; + + support_bf16_storage = false; + support_fp16_storage = false; + support_int8_storage = false; + support_tensor_storage = false; + } + return 0; +} + +int Pooling_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (adaptive_pooling) + { + return Pooling::forward(bottom_blob, top_blob, opt); + } + + // max value in NxN window + // avg value in NxN window + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h); + + if (elempack == 4) + { + if (global_pooling) + { + top_blob.create(channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int size = w * h; + + if (pooling_type == PoolMethod_MAX) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + + __m128 _max = (__m128)__lsx_vld(ptr, 0); + for (int i = 0; i < size; i++) + { + __m128 _val = (__m128)__lsx_vld(ptr, 0); + _max = __lsx_vfmax_s(_max, _val); + ptr += 4; + } + + float* outptr = top_blob; + __lsx_vst(_max, outptr + q * 4, 0); + } + } + else if (pooling_type == PoolMethod_AVE) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (int i = 0; i < 
size; i++) + { + __m128 _val = (__m128)__lsx_vld(ptr, 0); + _sum = __lsx_vfadd_s(_sum, _val); + ptr += 4; + } + + __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / size)); + + float* outptr = top_blob; + __lsx_vst(_avg, outptr + q * 4, 0); + } + } + + return 0; + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_w) / stride_w + 1; + int outh = (h - kernel_h) / stride_h + 1; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w - kernel_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2++; + } + p2 += gap; + } + } + + if (pooling_type == PoolMethod_MAX) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + __m128 _max = (__m128)__lsx_vld(sptr, 0); + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + _max = __lsx_vfmax_s(_max, _val); + } + + __lsx_vst(_max, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + else if (pooling_type == PoolMethod_AVE) + { + if (avgpool_count_include_pad == 0) + { + int wtailpad = 0; + int htailpad = 0; + + if (pad_mode == 0) // full padding + { + wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right; + htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + int sy0 = i * stride_h; + + for (int j = 0; j < outw; j++) + { + int sx0 = j * stride_w; + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + int area = 0; + + for (int ki = 0; ki < kernel_h; ki++) + { + int sy = sy0 + ki; + + if (sy < pad_top) + continue; + + if (sy >= h - pad_bottom - htailpad) + break; + + for (int kj = 0; kj < kernel_w; kj++) + { + int sx = sx0 + kj; + + if (sx < pad_left) + continue; + + if (sx >= w - pad_right - wtailpad) + break; + + __m128 _val = (__m128)__lsx_vld(m.row(sy) + sx * 4, 0); + _sum = __lsx_vfadd_s(_sum, _val); + area += 1; + } + } + + __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / area)); + __lsx_vst(_avg, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + else // if (avgpool_count_include_pad == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + const float inv_maxk = 1.f / maxk; + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + _sum = __lsx_vfadd_s(_sum, _val); + } + + __m128 _avg = 
__lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(inv_maxk)); + __lsx_vst(_avg, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + return Pooling::forward(bottom_blob, top_blob, opt); +} + +} // namespace ncnn diff --git a/src/layer/loongarch/pooling_loongarch.h b/src/layer/loongarch/pooling_loongarch.h new file mode 100644 index 000000000000..97e0c9ff2f7e --- /dev/null +++ b/src/layer/loongarch/pooling_loongarch.h @@ -0,0 +1,33 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_POOLING_LOONGARCH_H +#define LAYER_POOLING_LOONGARCH_H + +#include "pooling.h" + +namespace ncnn { + +class Pooling_loongarch : virtual public Pooling +{ +public: + Pooling_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_POOLING_LOONGARCH_H diff --git a/src/layer/loongarch/prelu_loongarch.cpp b/src/layer/loongarch/prelu_loongarch.cpp new file mode 100644 index 000000000000..27cc0bc9d446 --- /dev/null +++ b/src/layer/loongarch/prelu_loongarch.cpp @@ -0,0 +1,193 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
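+// PReLU_loongarch computes f(x) = x for x > 0 and slope * x otherwise; the LSX path builds an
+// "x <= 0" mask with __lsx_vfcmp_cle_s and blends x and slope * x with __lsx_vbitsel_v.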
+ +#include "prelu_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +PReLU_loongarch::PReLU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int PReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + int w = bottom_top_blob.w * elempack; + +#if __loongarch_sx + int nn_w = w / 4; + int remain_w_start = nn_w * 4; +#else + int remain_w_start = 0; +#endif // __loongarch_sx + + float* ptr = bottom_top_blob; + + if (num_slope > 1) + { + const float* slope = slope_data; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vld(slope + i * 4, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + float v = ptr[i]; + if (v < 0.f) + ptr[i] = v * slope[i]; + } + } + else + { + const float slope = slope_data[0]; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + float v = ptr[i]; + if (v < 0.f) + ptr[i] = v * slope; + } + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w * elempack; + int h = bottom_top_blob.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + + const float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; + + int j = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope); + + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + float v = *ptr; + if (v < 0.f) + *ptr = v * slope; + + ptr++; + } + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + const float* slope_data_ptr = slope_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float slope = num_slope > 1 ? 
slope_data_ptr[q] : slope_data_ptr[0]; + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope); + + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/prelu_loongarch.h b/src/layer/loongarch/prelu_loongarch.h new file mode 100644 index 000000000000..97031bb06016 --- /dev/null +++ b/src/layer/loongarch/prelu_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PRELU_LOONGARCH_H +#define LAYER_PRELU_LOONGARCH_H + +#include "prelu.h" + +namespace ncnn { + +class PReLU_loongarch : virtual public PReLU +{ +public: + PReLU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PRELU_LOONGARCH_H diff --git a/src/layer/loongarch/quantize_loongarch.cpp b/src/layer/loongarch/quantize_loongarch.cpp new file mode 100644 index 000000000000..657ff2d06bf5 --- /dev/null +++ b/src/layer/loongarch/quantize_loongarch.cpp @@ -0,0 +1,494 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Quantize_loongarch::Quantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale); + _vhigh = __lsx_vfmul_s(_vhigh, _scale); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale0); + _vhigh = __lsx_vfmul_s(_vhigh, _scale1); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = 
float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(ptr0 + 32); + __builtin_prefetch(ptr1 + 32); + __m128 _v0 = (__m128)__lsx_vld(ptr0, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr0 + 4, 0); + __m128 _v2 = (__m128)__lsx_vld(ptr1, 0); + __m128 _v3 = (__m128)__lsx_vld(ptr1 + 4, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + _v2 = __lsx_vfmul_s(_v2, _scale); + _v3 = __lsx_vfmul_s(_v3, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v2); + *((int64_t*)(outptr + 8)) = float2int8(_v1, _v3); + + ptr0 += 8; + ptr1 += 8; + outptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale); + _vhigh = __lsx_vfmul_s(_vhigh, _scale); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale0); + _vhigh = __lsx_vfmul_s(_vhigh, _scale1); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp 
parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 15 < size; i += 16) + { + __builtin_prefetch(ptr + 64); + __m128 _v0 = (__m128)__lsx_vld(ptr, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); + __m128 _v2 = (__m128)__lsx_vld(ptr + 8, 0); + __m128 _v3 = (__m128)__lsx_vld(ptr + 12, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + _v2 = __lsx_vfmul_s(_v2, _scale); + _v3 = __lsx_vfmul_s(_v3, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v1); + *((int64_t*)(outptr + 8)) = float2int8(_v2, _v3); + + ptr += 16; + outptr += 16; + } + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 32); + __m128 _v0 = (__m128)__lsx_vld(ptr, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v1); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/quantize_loongarch.h b/src/layer/loongarch/quantize_loongarch.h new file mode 100644 index 000000000000..cae04aab171f --- /dev/null +++ b/src/layer/loongarch/quantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_QUANTIZE_LOONGARCH_H +#define LAYER_QUANTIZE_LOONGARCH_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_loongarch : virtual public Quantize +{ +public: + Quantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/relu_loongarch.cpp b/src/layer/loongarch/relu_loongarch.cpp new file mode 100644 index 000000000000..eb478d3ae9b1 --- /dev/null +++ b/src/layer/loongarch/relu_loongarch.cpp @@ -0,0 +1,98 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
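+
+// ReLU with LSX: when slope == 0 each group of 4 floats is clamped against a
+// zero vector with __lsx_vfmax_s; when slope != 0 (leaky ReLU) the same
+// compare-and-blend sequence as the PReLU layer is used, scaling only the
+// non-positive lanes. Remaining elements fall back to the scalar loops.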
+ +#include "relu_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +ReLU_loongarch::ReLU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int ReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + if (slope == 0.f) + { + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _zero); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr = 0; + ptr++; + } + } + else + { + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/relu_loongarch.h b/src/layer/loongarch/relu_loongarch.h new file mode 100644 index 000000000000..445c6e8febca --- /dev/null +++ b/src/layer/loongarch/relu_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RELU_LOONGARCH_H +#define LAYER_RELU_LOONGARCH_H + +#include "relu.h" + +namespace ncnn { + +class ReLU_loongarch : virtual public ReLU +{ +public: + ReLU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_RELU_LOONGARCH_H diff --git a/src/layer/loongarch/requantize_leakyrelu_pack4.h b/src/layer/loongarch/requantize_leakyrelu_pack4.h new file mode 100644 index 000000000000..d6b499426609 --- /dev/null +++ b/src/layer/loongarch/requantize_leakyrelu_pack4.h @@ -0,0 +1,271 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_leakyrelu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmul_s(_v00, _scale0); + _v01 = __lsx_vfmul_s(_v01, _scale0); + _v02 = __lsx_vfmul_s(_v02, _scale0); + _v03 = __lsx_vfmul_s(_v03, _scale0); + _v10 = __lsx_vfmul_s(_v10, _scale1); + _v11 = __lsx_vfmul_s(_v11, _scale1); + _v12 = __lsx_vfmul_s(_v12, _scale1); + _v13 = __lsx_vfmul_s(_v13, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); + _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); + _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr0 + 32); + __builtin_prefetch(intptr1 + 32); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __m128i v = float2int8leakyrelu(_v, _slope); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + _bias = __lsx_vfmul_s(_bias, _scale_out); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __m128i v = float2int8leakyrelu(_v, _slope); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/loongarch/requantize_leakyrelu_pack8.h b/src/layer/loongarch/requantize_leakyrelu_pack8.h new file mode 100644 index 000000000000..a2c4faed4f2a --- /dev/null +++ b/src/layer/loongarch/requantize_leakyrelu_pack8.h @@ -0,0 +1,188 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
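+
+// Fused requantize + leaky ReLU for elempack=8 blobs. Per element this is the
+// scalar computation (scale_in, scale_out and bias being the per-channel or
+// shared values):
+//   float v = (float)(*intptr) * scale_in + bias;
+//   v = v > 0.f ? v : v * slope;
+//   *ptr = float2int8(v * scale_out);
+// Since leaky ReLU commutes with multiplication by a positive scale_out, the
+// two multiplies are folded ahead of the loop into scale = scale_in * scale_out
+// and bias = bias * scale_out, so the inner loops need a single __lsx_vfmadd_s
+// (or __lsx_vfmul_s when there is no bias) before the fused int8 leakyrelu
+// conversion. Each packed channel group is processed as two 4-lane vectors.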
+ +static void requantize_leakyrelu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + _v4 = __lsx_vfmul_s(_v4, _scale0); + _v5 = __lsx_vfmul_s(_v5, _scale1); + _v6 = __lsx_vfmul_s(_v6, _scale0); + _v7 = __lsx_vfmul_s(_v7, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = 
__lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); + _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); + _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); + _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + 
__builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/loongarch/requantize_loongarch.cpp b/src/layer/loongarch/requantize_loongarch.cpp new file mode 100644 index 000000000000..556d20de4f6d --- /dev/null +++ b/src/layer/loongarch/requantize_loongarch.cpp @@ -0,0 +1,1386 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "requantize_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "requantize_leakyrelu_pack4.h" +#include "requantize_leakyrelu_pack8.h" +#include "requantize_relu_pack4.h" +#include "requantize_relu_pack8.h" +#endif // __loongarch_sx + +Requantize_loongarch::Requantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Requantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in); + _v1 = __lsx_vfmul_s(_v1, _scale_in); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); + _v0 = 
activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in); + _v1 = __lsx_vfmul_s(_v1, _scale_in); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + 
i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 
_bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for 
(int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/requantize_loongarch.h b/src/layer/loongarch/requantize_loongarch.h new file mode 100644 index 000000000000..8175989959eb --- /dev/null +++ b/src/layer/loongarch/requantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_REQUANTIZE_LOONGARCH_H +#define LAYER_REQUANTIZE_LOONGARCH_H + +#include "requantize.h" + +namespace ncnn { + +class Requantize_loongarch : virtual public Requantize +{ +public: + Requantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_REQUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/requantize_relu_pack4.h b/src/layer/loongarch/requantize_relu_pack4.h new file mode 100644 index 000000000000..2fba8dfc2e48 --- /dev/null +++ b/src/layer/loongarch/requantize_relu_pack4.h @@ -0,0 +1,267 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_relu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmul_s(_v00, _scale0); + _v01 = __lsx_vfmul_s(_v01, _scale0); + _v02 = __lsx_vfmul_s(_v02, _scale0); + _v03 = __lsx_vfmul_s(_v03, _scale0); + _v10 = __lsx_vfmul_s(_v10, _scale1); + _v11 = __lsx_vfmul_s(_v11, _scale1); + _v12 = __lsx_vfmul_s(_v12, _scale1); + _v13 = __lsx_vfmul_s(_v13, _scale1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); + *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); + _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); + _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); + *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr0 + 32); + __builtin_prefetch(intptr1 + 32); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __m128i v = float2int8relu(_v); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + _bias = __lsx_vfmul_s(_bias, _scale_out); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __m128i v = float2int8relu(_v); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/loongarch/requantize_relu_pack8.h b/src/layer/loongarch/requantize_relu_pack8.h new file mode 100644 index 000000000000..3d2a45b45d06 --- /dev/null +++ b/src/layer/loongarch/requantize_relu_pack8.h @@ -0,0 +1,186 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
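For reference, the scale/bias folding stated in the comments at the top of these helpers — int8(relu(v * scale_in + bias) * scale_out) becomes int8_relu(v * (scale_in * scale_out) + bias * scale_out) — works because scale_out is non-negative, so relu(x) * scale_out == relu(x * scale_out). The scalar sketch below is an editorial illustration of that fold, not part of the patch; float2int8relu_ref is a hypothetical stand-in for ncnn's round-and-saturate helper.

// Scalar sketch of the folded requantize + ReLU (illustration only, not part of the patch).
#include <cmath>

// hypothetical stand-in for float2int8relu: round to nearest, clamp to [0, 127]
static inline signed char float2int8relu_ref(float v)
{
    int q = (int)std::lround(v);
    if (q < 0) q = 0;
    if (q > 127) q = 127;
    return (signed char)q;
}

// naive:  int8(relu(v * scale_in + bias) * scale_out)
// folded: int8_relu(v * (scale_in * scale_out) + bias * scale_out)
static void requantize_relu_ref(const int* intptr, signed char* ptr, int size,
                                float scale_in, float scale_out, float bias)
{
    const float scale = scale_in * scale_out; // hoisted out of the loop, as in the LSX code
    const float bias_out = bias * scale_out;
    for (int i = 0; i < size; i++)
    {
        ptr[i] = float2int8relu_ref((float)intptr[i] * scale + bias_out);
    }
}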
+ +static void requantize_relu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + _v4 = __lsx_vfmul_s(_v4, _scale0); + _v5 = __lsx_vfmul_s(_v5, _scale1); + _v6 = __lsx_vfmul_s(_v6, _scale0); + _v7 = __lsx_vfmul_s(_v7, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); + *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = 
float2int8relu(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); + _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); + _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); + _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); + *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git 
a/src/layer/loongarch/sigmoid_loongarch.cpp b/src/layer/loongarch/sigmoid_loongarch.cpp
new file mode 100644
index 000000000000..6d112804f269
--- /dev/null
+++ b/src/layer/loongarch/sigmoid_loongarch.cpp
@@ -0,0 +1,76 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "sigmoid_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+#include "loongarch_usability.h"
+
+#include <math.h>
+
+namespace ncnn {
+
+Sigmoid_loongarch::Sigmoid_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif
+}
+
+int Sigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        int i = 0;
+#if __loongarch_sx
+        __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+        for (; i + 3 < size; i += 4)
+        {
+            __builtin_prefetch(ptr + 16);
+            __m128 _p = (__m128)__lsx_vld(ptr, 0);
+            _p = (__m128)__lsx_vbitrevi_w((__m128i)_p, 31);
+            _p = exp_ps(_p);
+            _p = __lsx_vfadd_s(_p, _one);
+            __m128 _outp = __lsx_vfdiv_s(_one, _p);
+            __lsx_vst(_outp, ptr, 0);
+
+            ptr += 4;
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++)
+        {
+            *ptr = 1.f / (1.f + exp(-*ptr));
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/sigmoid_loongarch.h b/src/layer/loongarch/sigmoid_loongarch.h
new file mode 100644
index 000000000000..b15aad235db5
--- /dev/null
+++ b/src/layer/loongarch/sigmoid_loongarch.h
@@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
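A note on the LSX path in Sigmoid_loongarch::forward_inplace above: instead of multiplying by -1, it negates the input by toggling the IEEE-754 sign bit (__lsx_vbitrevi_w(_p, 31)), then computes 1 / (1 + exp(-x)) with exp_ps, a vector add and a vector divide. The snippet below is an illustrative scalar rendering of that sign-bit trick and of the reference formula used by the scalar tail loop; it is not part of the patch.

// Scalar illustration of the sign-bit negation used by the LSX sigmoid path (not part of the patch).
#include <cmath>
#include <cstdint>
#include <cstring>

static inline float negate_via_sign_bit(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // reinterpret the float's bit pattern
    bits ^= 0x80000000u;                  // flip bit 31, the IEEE-754 sign bit
    std::memcpy(&x, &bits, sizeof(bits));
    return x;
}

static inline float sigmoid_ref(float x)
{
    // same formula as the scalar tail loop: 1 / (1 + exp(-x))
    return 1.f / (1.f + std::exp(negate_via_sign_bit(x)));
}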
+ +#ifndef LAYER_SIGMOID_LOONGARCH_H +#define LAYER_SIGMOID_LOONGARCH_H + +#include "sigmoid.h" + +namespace ncnn { + +class Sigmoid_loongarch : virtual public Sigmoid +{ +public: + Sigmoid_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SIGMOID_LOONGARCH_H diff --git a/src/layer/loongarch/slice_loongarch.cpp b/src/layer/loongarch/slice_loongarch.cpp new file mode 100644 index 000000000000..edd8656a4bb3 --- /dev/null +++ b/src/layer/loongarch/slice_loongarch.cpp @@ -0,0 +1,371 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "slice_loongarch.h" + +namespace ncnn { + +Slice_loongarch::Slice_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Slice_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + const int* slices_ptr = slices; + int positive_axis = axis < 0 ? dims + axis : axis; + + if (dims == 1) // positive_axis == 0 + { + // slice vector + int w = bottom_blob.w * elempack; + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + out_elempack = slice % 4 == 0 ? 4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = (const float*)bottom_blob + q; + float* outptr = top_blob; + memcpy(outptr, ptr, top_blob.w * top_blob.elemsize); + + q += slice; + } + } + + if (dims == 2 && positive_axis == 0) + { + // slice image height + int w = bottom_blob.w; + int h = bottom_blob.h * elempack; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (h - q) / (top_blobs.size() - i); + } + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + out_elempack = slice % 4 == 0 ? 
4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } + + const float* ptr = bottom_blob_unpacked; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + if (out_elempack == 1 && top_blob.elempack == 4) + { + for (int j = 0; j < top_blob.h; j++) + { + const float* r0 = ptr; + const float* r1 = ptr + w; + const float* r2 = ptr + w * 2; + const float* r3 = ptr + w * 3; + + float* outptr0 = top_blob.row(j); + + for (int j = 0; j < w; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + + outptr0 += 4; + } + + ptr += w * 4; + } + } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = w * top_blob.h; + + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); + + ptr += size * top_blob.elempack; + } + } + } + + if (dims == 2 && positive_axis == 1) + { + // slice image width + int w = bottom_blob.w; + int h = bottom_blob.h; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.row(j); + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + float* outptr = top_blob.row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); + + ptr += top_blob.w * elempack; + } + } + } + + if (dims == 3 && positive_axis == 0) + { + // slice dim channel + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c * elempack; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (channels - q) / (top_blobs.size() - i); + } + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + out_elempack = slice % 4 == 0 ? 
4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } + + int p = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + if (out_elempack == 1 && top_blob.elempack == 4) + { + int size = top_blob.w * top_blob.h; + + for (int q = 0; q < top_blob.c; q++) + { + const float* r0 = bottom_blob_unpacked.channel(p); + const float* r1 = bottom_blob_unpacked.channel(p + 1); + const float* r2 = bottom_blob_unpacked.channel(p + 2); + const float* r3 = bottom_blob_unpacked.channel(p + 3); + + float* outptr0 = top_blob.channel(q); + + for (int j = 0; j < size; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + + outptr0 += 4; + } + + p += 4; + } + } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = top_blob.total(); + + const float* ptr = bottom_blob_unpacked.channel(p); + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); + + p += top_blob.c; + } + } + } + + if (dims == 3 && positive_axis == 1) + { + // slice dim height + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (h - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* ptr = bottom_blob.channel(p); + + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + int size = top_blob.w * top_blob.h; + + float* outptr = top_blob.channel(p); + memcpy(outptr, ptr, size * elemsize); + + ptr += size * elempack; + } + } + } + + if (dims == 3 && positive_axis == 2) + { + // slice dim width + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* ptr = bottom_blob.channel(p); + + for (int j = 0; j < h; j++) + { + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + float* outptr = top_blob.channel(p).row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); + + ptr += top_blob.w * elempack; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/slice_loongarch.h 
b/src/layer/loongarch/slice_loongarch.h new file mode 100644 index 000000000000..b42138ba4183 --- /dev/null +++ b/src/layer/loongarch/slice_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_SLICE_LOONGARCH_H +#define LAYER_SLICE_LOONGARCH_H + +#include "slice.h" + +namespace ncnn { + +class Slice_loongarch : virtual public Slice +{ +public: + Slice_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SLICE_LOONGARCH_H diff --git a/src/layer/loongarch/softmax_loongarch.cpp b/src/layer/loongarch/softmax_loongarch.cpp new file mode 100644 index 000000000000..88b49559754b --- /dev/null +++ b/src/layer/loongarch/softmax_loongarch.cpp @@ -0,0 +1,175 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "softmax_loongarch.h" + +#include +#include + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +int Softmax_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + size_t elemsize = bottom_top_blob.elemsize; + int positive_axis = axis < 0 ? 
dims + axis : axis; + + if (dims != 3 || positive_axis != 0) + return Softmax::forward_inplace(bottom_top_blob, opt); + + // value = exp( value - global max value ) + // sum all value + // value = value / sum + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + Mat max; + max.create(w, h, elemsize, opt.workspace_allocator); + if (max.empty()) + return -100; + max.fill(-FLT_MAX); + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* maxptr = max; + + for (int i = 0; i < size; i++) + { + maxptr[i] = std::max(maxptr[i], ptr[i]); + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* maxptr = max; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _max = (__m128)__lsx_vld(maxptr, 0); + + _p = exp_ps(__lsx_vfsub_s(_p, _max)); + + __lsx_vst(_p, ptr, 0); + + ptr += 4; + maxptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr = exp(*ptr - *maxptr); + + ptr++; + maxptr++; + } + } + + Mat sum; + sum.create(w, h, elemsize, opt.workspace_allocator); + if (sum.empty()) + return -100; + sum.fill(0.f); + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* sumptr = sum; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _sum = (__m128)__lsx_vld(sumptr, 0); + _sum = __lsx_vfadd_s(_sum, _p); + __lsx_vst(_sum, sumptr, 0); + + ptr += 4; + sumptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *sumptr += *ptr; + + ptr++; + sumptr++; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* sumptr = sum; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _sum = (__m128)__lsx_vld(sumptr, 0); + _p = __lsx_vfdiv_s(_p, _sum); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + sumptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr /= *sumptr; + + ptr++; + sumptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/softmax_loongarch.h b/src/layer/loongarch/softmax_loongarch.h new file mode 100644 index 000000000000..3c8272a6412f --- /dev/null +++ b/src/layer/loongarch/softmax_loongarch.h @@ -0,0 +1,30 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_SOFTMAX_LOONGARCH_H +#define LAYER_SOFTMAX_LOONGARCH_H + +#include "softmax.h" + +namespace ncnn { + +class Softmax_loongarch : virtual public Softmax +{ +public: + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SOFTMAX_LOONGARCH_H diff --git a/src/layer/loongarch/swish_loongarch.cpp b/src/layer/loongarch/swish_loongarch.cpp new file mode 100644 index 000000000000..9c9005de6fcc --- /dev/null +++ b/src/layer/loongarch/swish_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "swish_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include + +namespace ncnn { + +Swish_loongarch::Swish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Swish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + _p = (__m128i)__lsx_vfdiv_s((__m128)_p, __lsx_vfadd_s(_one, exp_ps((__m128)__lsx_vbitrevi_w(_p, 31)))); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr / (1.f + exp(-*ptr)); + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/swish_loongarch.h b/src/layer/loongarch/swish_loongarch.h new file mode 100644 index 000000000000..b8d0b80f01e4 --- /dev/null +++ b/src/layer/loongarch/swish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
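The Swish kernel above computes x / (1 + exp(-x)) per lane, which equals x * sigmoid(x); the scalar tail uses the same formula. A scalar reference that the packed path can be checked against (illustrative C++ sketch; swish_ref is a placeholder name, not an ncnn symbol):

    #include <cmath>
    static inline float swish_ref(float x)
    {
        return x / (1.f + std::exp(-x)); // equivalently x * sigmoid(x)
    }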
+ +#ifndef LAYER_SWISH_LOONGARCH_H +#define LAYER_SWISH_LOONGARCH_H + +#include "swish.h" + +namespace ncnn { + +class Swish_loongarch : virtual public Swish +{ +public: + Swish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SWISH_LOONGARCH_H diff --git a/src/layer/loongarch/tanh_loongarch.cpp b/src/layer/loongarch/tanh_loongarch.cpp new file mode 100644 index 000000000000..13227fa71e34 --- /dev/null +++ b/src/layer/loongarch/tanh_loongarch.cpp @@ -0,0 +1,69 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "tanh_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include + +namespace ncnn { + +TanH_loongarch::TanH_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int TanH_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = tanh_ps(_p); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = tanh(*ptr); + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/tanh_loongarch.h b/src/layer/loongarch/tanh_loongarch.h new file mode 100644 index 000000000000..ecbab01ec8fe --- /dev/null +++ b/src/layer/loongarch/tanh_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
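Sigmoid, Swish and TanH above share one loop shape: support_packing is enabled and the per-channel element count is w * h * d * elempack, so packed and unpacked layouts both run through a single contiguous loop, an LSX body handling 4 floats per step plus a scalar tail. A stripped-down sketch of that skeleton, with f standing in for any per-element function (illustrative C++ only; apply_channel and f are not ncnn names):

    template<typename F>
    static void apply_channel(float* ptr, int size, F f)
    {
        int i = 0;
        // an LSX body would consume 4 lanes per iteration here:
        // for (; i + 3 < size; i += 4) { load / op / store; ptr += 4; }
        for (; i < size; i++)
        {
            *ptr = f(*ptr); // scalar tail, also the whole loop without __loongarch_sx
            ptr++;
        }
    }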
+ +#ifndef LAYER_TANH_LOONGARCH_H +#define LAYER_TANH_LOONGARCH_H + +#include "tanh.h" + +namespace ncnn { + +class TanH_loongarch : virtual public TanH +{ +public: + TanH_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_TANH_LOONGARCH_H diff --git a/src/layer/loongarch/unaryop_loongarch.cpp b/src/layer/loongarch/unaryop_loongarch.cpp new file mode 100644 index 000000000000..892c4dc42608 --- /dev/null +++ b/src/layer/loongarch/unaryop_loongarch.cpp @@ -0,0 +1,427 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "unaryop_loongarch.h" + +#include + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +UnaryOp_loongarch::UnaryOp_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +template +static int unary_op_inplace(Mat& a, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = a.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = op.func_pack4(_p); + __lsx_vst(_p, ptr, 0); + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = op.func(*ptr); + ptr++; + } + } + + return 0; +} + +namespace UnaryOp_loongarch_functor { + +struct unary_op_abs +{ + float func(const float& x) const + { + return (float)fabs(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return (__m128)__lsx_vbitclri_w((__m128i)x, 31); + } +#endif // __loongarch_sx +}; + +struct unary_op_neg +{ + float func(const float& x) const + { + return -x; + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return (__m128)__lsx_vbitrevi_w((__m128i)x, 31); + } +#endif // __loongarch_sx +}; + +struct unary_op_floor +{ + float func(const float& x) const + { + return (float)floor(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = floor(tmp[0]); + tmp[1] = floor(tmp[1]); + tmp[2] = floor(tmp[2]); + tmp[3] = floor(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_ceil +{ + float func(const float& x) const + { + return (float)ceil(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = ceil(tmp[0]); + tmp[1] = ceil(tmp[1]); + tmp[2] = ceil(tmp[2]); + tmp[3] = ceil(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_square +{ + float func(const 
float& x) const + { + return x * x; + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfmul_s(x, x); + } +#endif // __loongarch_sx +}; + +struct unary_op_sqrt +{ + float func(const float& x) const + { + return (float)sqrt(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfsqrt_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_rsqrt +{ + float func(const float& x) const + { + return (float)(1.f / sqrt(x)); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfrsqrt_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_exp +{ + float func(const float& x) const + { + return (float)exp(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return exp_ps(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_log +{ + float func(const float& x) const + { + return (float)log(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return log_ps(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_sin +{ + float func(const float& x) const + { + return (float)sin(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = sin(tmp[0]); + tmp[1] = sin(tmp[1]); + tmp[2] = sin(tmp[2]); + tmp[3] = sin(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_cos +{ + float func(const float& x) const + { + return (float)cos(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = cos(tmp[0]); + tmp[1] = cos(tmp[1]); + tmp[2] = cos(tmp[2]); + tmp[3] = cos(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_tan +{ + float func(const float& x) const + { + return (float)tan(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = tan(tmp[0]); + tmp[1] = tan(tmp[1]); + tmp[2] = tan(tmp[2]); + tmp[3] = tan(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_asin +{ + float func(const float& x) const + { + return (float)asin(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = asin(tmp[0]); + tmp[1] = asin(tmp[1]); + tmp[2] = asin(tmp[2]); + tmp[3] = asin(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_acos +{ + float func(const float& x) const + { + return (float)acos(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = acos(tmp[0]); + tmp[1] = acos(tmp[1]); + tmp[2] = acos(tmp[2]); + tmp[3] = acos(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_atan +{ + float func(const float& x) const + { + return (float)atan(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = atan(tmp[0]); + tmp[1] = atan(tmp[1]); + tmp[2] = atan(tmp[2]); + tmp[3] = atan(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_reciprocal +{ + float func(const float& x) const + { + return 1.f / x; + } +#if __loongarch_sx + __m128 func_pack4(const 
__m128& x) const + { + return __lsx_vfrecip_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_tanh +{ + float func(const float& x) const + { + return (float)tanh(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return tanh_ps(x); + } +#endif // __loongarch_sx +}; + +} // namespace UnaryOp_loongarch_functor + +int UnaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + using namespace UnaryOp_loongarch_functor; + + if (op_type == Operation_ABS) + return unary_op_inplace<unary_op_abs>(bottom_top_blob, opt); + + if (op_type == Operation_NEG) + return unary_op_inplace<unary_op_neg>(bottom_top_blob, opt); + + if (op_type == Operation_FLOOR) + return unary_op_inplace<unary_op_floor>(bottom_top_blob, opt); + + if (op_type == Operation_CEIL) + return unary_op_inplace<unary_op_ceil>(bottom_top_blob, opt); + + if (op_type == Operation_SQUARE) + return unary_op_inplace<unary_op_square>(bottom_top_blob, opt); + + if (op_type == Operation_SQRT) + return unary_op_inplace<unary_op_sqrt>(bottom_top_blob, opt); + + if (op_type == Operation_RSQRT) + return unary_op_inplace<unary_op_rsqrt>(bottom_top_blob, opt); + + if (op_type == Operation_EXP) + return unary_op_inplace<unary_op_exp>(bottom_top_blob, opt); + + if (op_type == Operation_LOG) + return unary_op_inplace<unary_op_log>(bottom_top_blob, opt); + + if (op_type == Operation_SIN) + return unary_op_inplace<unary_op_sin>(bottom_top_blob, opt); + + if (op_type == Operation_COS) + return unary_op_inplace<unary_op_cos>(bottom_top_blob, opt); + + if (op_type == Operation_TAN) + return unary_op_inplace<unary_op_tan>(bottom_top_blob, opt); + + if (op_type == Operation_ASIN) + return unary_op_inplace<unary_op_asin>(bottom_top_blob, opt); + + if (op_type == Operation_ACOS) + return unary_op_inplace<unary_op_acos>(bottom_top_blob, opt); + + if (op_type == Operation_ATAN) + return unary_op_inplace<unary_op_atan>(bottom_top_blob, opt); + + if (op_type == Operation_RECIPROCAL) + return unary_op_inplace<unary_op_reciprocal>(bottom_top_blob, opt); + + if (op_type == Operation_TANH) + return unary_op_inplace<unary_op_tanh>(bottom_top_blob, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/unaryop_loongarch.h b/src/layer/loongarch/unaryop_loongarch.h new file mode 100644 index 000000000000..8170bec50cf8 --- /dev/null +++ b/src/layer/loongarch/unaryop_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License.
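forward_inplace above instantiates the shared unary_op_inplace loop once per operation; each functor contributes a scalar func plus an LSX func_pack4 for the 4-lane body. A minimal sketch of what another operation would look like in this scheme (illustrative only; unary_op_cube and Operation_CUBE are hypothetical, not existing ncnn identifiers):

    struct unary_op_cube
    {
        float func(const float& x) const
        {
            return x * x * x;
        }
    #if __loongarch_sx
        __m128 func_pack4(const __m128& x) const
        {
            return __lsx_vfmul_s(__lsx_vfmul_s(x, x), x); // x^3, 4 lanes at a time
        }
    #endif // __loongarch_sx
    };
    // dispatch entry:
    // if (op_type == Operation_CUBE)
    //     return unary_op_inplace<unary_op_cube>(bottom_top_blob, opt);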
+ +#ifndef LAYER_UNARYOP_LOONGARCH_H +#define LAYER_UNARYOP_LOONGARCH_H + +#include "unaryop.h" + +namespace ncnn { + +class UnaryOp_loongarch : virtual public UnaryOp +{ +public: + UnaryOp_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_UNARYOP_LOONGARCH_H diff --git a/src/layer/lstm.cpp b/src/layer/lstm.cpp index 6749f05e019e..a065bcacae3b 100644 --- a/src/layer/lstm.cpp +++ b/src/layer/lstm.cpp @@ -29,6 +29,7 @@ int LSTM::load_param(const ParamDict& pd) num_output = pd.get(0, 0); weight_data_size = pd.get(1, 0); direction = pd.get(2, 0); + hidden_size = pd.get(3, num_output); return 0; } @@ -36,36 +37,52 @@ int LSTM::load_model(const ModelBin& mb) { int num_directions = direction == 2 ? 2 : 1; - int size = weight_data_size / num_directions / num_output / 4; + int size = weight_data_size / num_directions / hidden_size / 4; // raw weight data - weight_xc_data = mb.load(size, num_output * 4, num_directions, 0); + weight_xc_data = mb.load(size, hidden_size * 4, num_directions, 0); if (weight_xc_data.empty()) return -100; - bias_c_data = mb.load(num_output, 4, num_directions, 0); + bias_c_data = mb.load(hidden_size, 4, num_directions, 0); if (bias_c_data.empty()) return -100; - weight_hc_data = mb.load(num_output, num_output * 4, num_directions, 0); + weight_hc_data = mb.load(num_output, hidden_size * 4, num_directions, 0); if (weight_hc_data.empty()) return -100; + if (num_output != hidden_size) + { + weight_hr_data = mb.load(hidden_size, num_output, num_directions, 0); + if (weight_hr_data.empty()) + return -100; + } + return 0; } -static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) +static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(4, num_output, 4u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 4u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -80,7 +97,7 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w const float* x = bottom_blob.row(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const float* bias_c_I = bias_c.row(0); const float* bias_c_F = bias_c.row(1); @@ -90,15 +107,15 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float* gates_data = gates.row(q); // gate I F O G - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = 
weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); float I = bias_c_I[q]; float F = bias_c_F[q]; @@ -140,7 +157,7 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w // h_t := o_t .* tanh[c_t] float* output_data = top_blob.row(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const float* gates_data = gates.row(q); @@ -157,8 +174,34 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float cell2 = F * cell_state[q] + I * G; float H = O * tanh(cell2); cell_state[q] = cell2; - hidden_state[q] = H; - output_data[q] = H; + + if (num_output == hidden_size) + { + hidden_state[q] = H; + output_data[q] = H; + } + else + { + tmp_hidden_state[q] = H; + } + } + + if (num_output != hidden_size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_state[i] * hr[i]; + } + + hidden_state[q] = H; + output_data[q] = H; + } } } @@ -177,7 +220,7 @@ int LSTM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons return -100; hidden.fill(0.f); - Mat cell(num_output, 4u, opt.workspace_allocator); + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -189,7 +232,7 @@ int LSTM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -204,14 +247,14 @@ int LSTM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (top_blob_reverse.empty()) return -100; - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.0f); cell.fill(0.0f); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, cell, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), num_output == hidden_size ? 
Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -251,7 +294,7 @@ int LSTM::forward(const std::vector& bottom_blobs, std::vector& top_bl return -100; hidden.fill(0.f); - cell.create(num_output, num_directions, 4u, hidden_cell_allocator); + cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -265,7 +308,7 @@ int LSTM::forward(const std::vector& bottom_blobs, std::vector& top_bl // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -282,13 +325,13 @@ int LSTM::forward(const std::vector& bottom_blobs, std::vector& top_bl Mat hidden0 = hidden.row_range(0, 1); Mat cell0 = cell.row_range(0, 1); - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, cell0, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt); if (ret0 != 0) return ret0; Mat hidden1 = hidden.row_range(1, 1); Mat cell1 = cell.row_range(1, 1); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, cell1, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden1, cell1, opt); if (ret1 != 0) return ret1; diff --git a/src/layer/lstm.h b/src/layer/lstm.h index 78d8366a0f96..58bd67f987ab 100644 --- a/src/layer/lstm.h +++ b/src/layer/lstm.h @@ -36,10 +36,12 @@ class LSTM : public Layer int num_output; int weight_data_size; int direction; // 0=forward 1=reverse 2=bidirectional + int hidden_size; Mat weight_hc_data; Mat weight_xc_data; Mat bias_c_data; + Mat weight_hr_data; }; } // namespace ncnn diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index ac26f599f048..966df81d41d0 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -27,6 +27,8 @@ int MultiHeadAttention::load_param(const ParamDict& pd) embed_dim = pd.get(0, 0); num_head = pd.get(1, 1); weight_data_size = pd.get(2, 0); + kdim = pd.get(3, embed_dim); + vdim = pd.get(4, embed_dim); return 0; } @@ -41,7 +43,7 @@ int MultiHeadAttention::load_model(const ModelBin& mb) if (q_bias_data.empty()) return -100; - k_weight_data = mb.load(weight_data_size, 0); + k_weight_data = mb.load(embed_dim * kdim, 0); if (k_weight_data.empty()) return -100; @@ -49,7 +51,7 @@ int MultiHeadAttention::load_model(const ModelBin& mb) if (k_bias_data.empty()) return -100; - v_weight_data = mb.load(weight_data_size, 0); + v_weight_data = mb.load(embed_dim * vdim, 0); if (v_weight_data.empty()) return -100; @@ -73,23 +75,26 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto { const Mat& q_blob = bottom_blobs[0]; const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; - const Mat& v_blob = bottom_blobs.size() == 1 ? 
q_blob : bottom_blobs[2]; + const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? k_blob : bottom_blobs[2]; - const int seqlen = q_blob.h; + const int src_seqlen = q_blob.h; + const int dst_seqlen = k_blob.h; const int embed_dim_per_head = embed_dim / num_head; + // assert k_blob.h == v_blob.h + Mat& top_blob = top_blobs[0]; - top_blob.create(embed_dim, seqlen, 4u, opt.blob_allocator); + top_blob.create(embed_dim, src_seqlen, 4u, opt.blob_allocator); if (top_blob.empty()) return -1; - Mat xq(embed_dim_per_head, seqlen, num_head, 4u, opt.workspace_allocator); - Mat xk(embed_dim_per_head, seqlen, num_head, 4u, opt.workspace_allocator); - Mat xv(seqlen, embed_dim_per_head, num_head, 4u, opt.workspace_allocator); + Mat xq(embed_dim_per_head, src_seqlen, num_head, 4u, opt.workspace_allocator); + Mat xk(embed_dim_per_head, dst_seqlen, num_head, 4u, opt.workspace_allocator); + Mat xv(dst_seqlen, embed_dim_per_head, num_head, 4u, opt.workspace_allocator); - Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); + Mat xqk(dst_seqlen, src_seqlen, num_head, 4u, opt.workspace_allocator); - Mat xqkv(embed_dim_per_head, num_head, seqlen, 4u, opt.workspace_allocator); + Mat xqkv(embed_dim_per_head, num_head, src_seqlen, 4u, opt.workspace_allocator); const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); @@ -100,7 +105,7 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto { Mat outm = xq.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = outm.row(i); @@ -124,17 +129,17 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto { Mat outm = xk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < dst_seqlen; i++) { float* outptr = outm.row(i); for (int j = 0; j < embed_dim_per_head; j++) { const float* ptr = k_blob.row(i); - const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); + const float* kptr = (const float*)k_weight_data + kdim * (q * embed_dim_per_head + j); float sum = k_bias_data[q * embed_dim_per_head + j]; - for (int k = 0; k < embed_dim; k++) + for (int k = 0; k < kdim; k++) { sum += *ptr++ * *kptr++; } @@ -150,13 +155,13 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto for (int i = 0; i < embed_dim_per_head; i++) { - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { const float* ptr = v_blob.row(j); - const float* kptr = (const float*)v_weight_data + embed_dim * (q * embed_dim_per_head + i); + const float* kptr = (const float*)v_weight_data + vdim * (q * embed_dim_per_head + i); float sum = v_bias_data[q * embed_dim_per_head + i]; - for (int k = 0; k < embed_dim; k++) + for (int k = 0; k < vdim; k++) { sum += *ptr++ * *kptr++; } @@ -169,19 +174,19 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto } // xqk = xq * xk - // xq (embed_dim_per_head, seqlen) - // xk (embed_dim_per_head, seqlen) + // xq (embed_dim_per_head, src_seqlen) + // xk (embed_dim_per_head, dst_seqlen) { const Mat xqm = xq.channel(q); const Mat xkm = xk.channel(q); Mat outm = xqk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = outm.row(i); - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { const float* qptr = xqm.row(i); const float* kptr = xkm.row(j); @@ -201,24 +206,24 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto { Mat outm = 
xqk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* ptr = outm.row(i); float max = -FLT_MAX; - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { max = std::max(max, ptr[j]); } float sum = 0.f; - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { ptr[j] = (float)(exp(ptr[j] - max)); sum += ptr[j]; } - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { ptr[j] /= sum; } @@ -226,14 +231,14 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto } // xqkv = xqk * xv - // xqk (seqlen, seqlen) - // xv (seqlen, embed_dim_per_head) - // out (embed_dim_per_head, num_head, seqlen) + // xqk (dst_seqlen, src_seqlen) + // xv (dst_seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, src_seqlen) { const Mat xqkm = xqk.channel(q); const Mat xvm = xv.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = xqkv.channel(i).row(q); @@ -243,7 +248,7 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto const float* vptr = xvm.row(j); float sum = 0.f; - for (int k = 0; k < seqlen; k++) + for (int k = 0; k < dst_seqlen; k++) { sum += *qkptr++ * *vptr++; } @@ -255,9 +260,9 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto } // out = affine(xqkv) - // xqkv (embed_dim, seqlen) + // xqkv (embed_dim, src_seqlen) #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = top_blob.row(i); diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index b878055385d0..2de5213ca315 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -34,6 +34,8 @@ class MultiHeadAttention : public Layer int embed_dim; int num_head; int weight_data_size; + int kdim; + int vdim; Mat q_weight_data; Mat q_bias_data; diff --git a/src/layer/riscv/absval_riscv.cpp b/src/layer/riscv/absval_riscv.cpp index 4a41788ec9e9..092a8b5d6b13 100644 --- a/src/layer/riscv/absval_riscv.cpp +++ b/src/layer/riscv/absval_riscv.cpp @@ -66,7 +66,7 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfabs_v_f32m8_absval(_p, vl); @@ -106,7 +106,7 @@ int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfabs_v_f16m8_absval(_p, vl); diff --git a/src/layer/riscv/batchnorm_riscv.cpp b/src/layer/riscv/batchnorm_riscv.cpp new file mode 100644 index 000000000000..2a8ec0cce582 --- /dev/null +++ b/src/layer/riscv/batchnorm_riscv.cpp @@ -0,0 +1,537 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "batchnorm_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" + +namespace ncnn { + +BatchNorm_riscv::BatchNorm_riscv() +{ +#if __riscv_vector + support_packing = true; +#if __riscv_zfh + support_fp16_storage = true; +#endif +#endif // __riscv_vector +} + +int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ +#if __riscv_vector + int elembits = bottom_top_blob.elembits(); + +#if __riscv_zfh + if (opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_inplace_fp16sa(bottom_top_blob, opt); + else + return forward_inplace_fp16s(bottom_top_blob, opt); + } +#endif + int elempack = bottom_top_blob.elempack; +#endif // __riscv_vector + int dims = bottom_top_blob.dims; + if (dims == 1) + { + float* ptr = bottom_top_blob; +#if __riscv_vector + const float* ptr_a = a_data; + const float* ptr_b = b_data; + int n = bottom_top_blob.w * elempack; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + + vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + vfloat32m8_t _a = vle32_v_f32m8(ptr_a, vl); + vfloat32m8_t _b = vle32_v_f32m8(ptr_b, vl); + + _p = vfmadd_vv_f32m8(_p, _b, _a, vl); + + vse32_v_f32m8(ptr, _p, vl); + + ptr += vl; + ptr_a += vl; + ptr_b += vl; + n -= vl; + } +#else + int w = bottom_top_blob.w; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = b_data[i] * ptr[i] + a_data[i]; + } +#endif // __riscv_vector + return 0; + } + +#if __riscv_vector + if (elempack == 1) +#endif + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + float a = a_data[i]; + float b = b_data[i]; + +#if __riscv_vector + int n = w; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + _p = vfmul_vf_f32m8(_p, b, vl); + _p = vfadd_vf_f32m8(_p, a, vl); + vse32_v_f32m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else + for (int j = 0; j < w; j++) + { + ptr[j] = b * ptr[j] + a; + } +#endif // __riscv_vector + } + } + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + _p = vfmul_vf_f32m8(_p, b, vl); + _p = vfadd_vf_f32m8(_p, a, vl); + vse32_v_f32m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = b * ptr[i] + a; + } +#endif // __riscv_vector + } + } + return 0; + } + +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + if (elempack == packn) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + + const size_t vl = vsetvl_e32m1(packn); + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + const float* ptr_a = a_data; + ptr_a += i * elempack; + const float* ptr_b = b_data; + ptr_b += i * elempack; + int n = w * elempack; + + vfloat32m1_t _a = vle32_v_f32m1(ptr_a, vl); + vfloat32m1_t _b = vle32_v_f32m1(ptr_b, vl); + while (n 
> 0) + { + vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); + _p = vfmadd_vv_f32m1(_p, _b, _a, vl); + vse32_v_f32m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + const float* ptr_a = (const float*)a_data + q * elempack; + const float* ptr_b = (const float*)b_data + q * elempack; + + vfloat32m1_t _a = vle32_v_f32m1(ptr_a, vl); + vfloat32m1_t _b = vle32_v_f32m1(ptr_b, vl); + + int n = size; + while (n > 0) + { + vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); + _p = vfmadd_vv_f32m1(_p, _b, _a, vl); + vse32_v_f32m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + } +#endif + return 0; +} + +#if __riscv_vector && __riscv_zfh +int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + if (dims == 1) + { + int n = bottom_top_blob.w * elempack; + __fp16* ptr = bottom_top_blob; + const float* ptr_a = a_data; + const float* ptr_b = b_data; + while (n > 0) + { + size_t vl = vsetvl_e16m4(n); + + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); + vfloat32m8_t _a = vle32_v_f32m8(ptr_a, vl); + vfloat32m8_t _b = vle32_v_f32m8(ptr_b, vl); + + _p = vfmadd_vv_f32m8(_p, _b, _a, vl); + + vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + ptr_a += vl; + ptr_b += vl; + n -= vl; + } + + return 0; + } + + if (elempack == 1) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + float a = a_data[i]; + float b = b_data[i]; + + int n = w; + while (n > 0) + { + size_t vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); + _p = vfmul_vf_f32m8(_p, b, vl); + _p = vfadd_vf_f32m8(_p, a, vl); + vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + + int n = size; + while (n > 0) + { + size_t vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); + ; + _p = vfmul_vf_f32m8(_p, b, vl); + _p = vfadd_vf_f32m8(_p, a, vl); + vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + + return 0; + } + + const int packn = csrr_vlenb() / 2; // fp16 + if (elempack == packn) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + + const size_t vl = vsetvl_e16m1(packn); + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + const float* ptr_a = (const float*)a_data + i * elempack; + const float* ptr_b = (const float*)b_data + i * elempack; + int n = w * elempack; + + vfloat32m2_t _a = vle32_v_f32m2(ptr_a, vl); + vfloat32m2_t _b = vle32_v_f32m2(ptr_b, vl); + while (n > 0) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr, vl), vl); + _p = vfmadd_vv_f32m2(_p, _b, _a, vl); + vse16_v_f16m1(ptr, 
vfncvt_f_f_w_f16m1(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + const float* ptr_a = (const float*)a_data + q * elempack; + const float* ptr_b = (const float*)b_data + q * elempack; + + vfloat32m2_t _a = vle32_v_f32m2(ptr_a, vl); + vfloat32m2_t _b = vle32_v_f32m2(ptr_b, vl); + + int n = size; + while (n > 0) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr, vl), vl); + _p = vfmadd_vv_f32m2(_p, _b, _a, vl); + vse16_v_f16m1(ptr, vfncvt_f_f_w_f16m1(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + } + + return 0; +} + +int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + if (dims == 1) + { + int n = bottom_top_blob.w * elempack; + __fp16* ptr = bottom_top_blob; + const float* ptr_a = a_data; + const float* ptr_b = b_data; + while (n > 0) + { + size_t vl = vsetvl_e16m4(n); + + vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); + vfloat16m4_t _a = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_a, vl), vl); + vfloat16m4_t _b = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_b, vl), vl); + + _p = vfmadd_vv_f16m4(_p, _b, _a, vl); + + vse16_v_f16m4(ptr, _p, vl); + + ptr += vl; + ptr_a += vl; + ptr_b += vl; + n -= vl; + } + + return 0; + } + + if (elempack == 1) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + float a = a_data[i]; + float b = b_data[i]; + + int n = w; + while (n > 0) + { + size_t vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); + _p = vfmul_vf_f16m8(_p, b, vl); + _p = vfadd_vf_f16m8(_p, a, vl); + vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + + int n = size; + while (n > 0) + { + size_t vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); + ; + _p = vfmul_vf_f16m8(_p, b, vl); + _p = vfadd_vf_f16m8(_p, a, vl); + vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + + return 0; + } + + const int packn = csrr_vlenb() / 2; // fp16 + if (elempack == packn) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + + const size_t vl = vsetvl_e16m1(packn); + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + const float* ptr_a = (const float*)a_data + i * elempack; + const float* ptr_b = (const float*)b_data + i * elempack; + int n = w * elempack; + + vfloat16m1_t _a = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_a, vl), vl); + vfloat16m1_t _b = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_b, vl), vl); + while (n > 0) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr, vl); + _p = vfmadd_vv_f16m1(_p, _b, _a, vl); + vse16_v_f16m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * 
elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + const float* ptr_a = (const float*)a_data + q * elempack; + const float* ptr_b = (const float*)b_data + q * elempack; + + vfloat16m1_t _a = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_a, vl), vl); + vfloat16m1_t _b = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_b, vl), vl); + + int n = size; + while (n > 0) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr, vl); + _p = vfmadd_vv_f16m1(_p, _b, _a, vl); + vse16_v_f16m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + } + + return 0; +} + +#endif // __riscv_vector && __riscv_zfh +} // namespace ncnn diff --git a/src/layer/riscv/batchnorm_riscv.h b/src/layer/riscv/batchnorm_riscv.h new file mode 100644 index 000000000000..e2365fa5fcfe --- /dev/null +++ b/src/layer/riscv/batchnorm_riscv.h @@ -0,0 +1,37 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BATCHNORM_RISCV_H +#define LAYER_BATCHNORM_RISCV_H + +#include "batchnorm.h" + +namespace ncnn { +class BatchNorm_riscv : virtual public BatchNorm +{ +public: + BatchNorm_riscv(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if __riscv_vector && __riscv_zfh + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; + int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_BATCHNORM_RISCV_H diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp index b4e53a2c8568..9858e654822a 100644 --- a/src/layer/riscv/binaryop_riscv.cpp +++ b/src/layer/riscv/binaryop_riscv.cpp @@ -67,7 +67,7 @@ static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(a0, _p, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -108,7 +108,7 @@ static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, b0, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -149,7 +149,7 @@ static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -217,7 +217,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t 
_outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -252,7 +252,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * h * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -289,7 +289,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -328,7 +328,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -367,7 +367,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -400,7 +400,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, *ptr1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -436,7 +436,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -469,7 +469,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _p = vfmv_v_f_f32m8(*ptr, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -508,7 +508,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -545,7 +545,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) const float* ptr1_vol = ptr1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1_vol, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -583,7 +583,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -620,7 +620,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) const float* ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, 
vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -662,7 +662,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -699,7 +699,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -736,7 +736,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * h1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -774,7 +774,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -825,7 +825,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -867,7 +867,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -899,7 +899,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -931,7 +931,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -985,7 +985,7 @@ static int binary_op_scalar_rvv(Mat& a, float b, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = op(_p, b, vl); vse32_v_f32m8(ptr, _p, vl); @@ -1000,21 +1000,21 @@ static int binary_op_scalar_rvv(Mat& a, float b, const Option& opt) namespace BinaryOp_riscv_functor { -#define MAKE_FUNCTION(NAME, IMPLVV, IMPLVS, IMPLSV) \ - struct NAME \ - { \ - vfloat32m8_t operator()(const vfloat32m8_t& x, const vfloat32m8_t& y, const word_type vl) const \ - { \ - return IMPLVV; \ - } \ - vfloat32m8_t operator()(const vfloat32m8_t& x, const float y, const word_type vl) const \ - { \ - return IMPLVS; \ - } \ - vfloat32m8_t operator()(const float x, const vfloat32m8_t& y, const word_type vl) const \ - { \ - return IMPLSV; \ - } \ +#define 
MAKE_FUNCTION(NAME, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + vfloat32m8_t operator()(const vfloat32m8_t& x, const vfloat32m8_t& y, const size_t vl) const \ + { \ + return IMPLVV; \ + } \ + vfloat32m8_t operator()(const vfloat32m8_t& x, const float y, const size_t vl) const \ + { \ + return IMPLVS; \ + } \ + vfloat32m8_t operator()(const float x, const vfloat32m8_t& y, const size_t vl) const \ + { \ + return IMPLSV; \ + } \ }; MAKE_FUNCTION(binary_op_add_rvv, vfadd_vv_f32m8(x, y, vl), vfadd_vf_f32m8(x, y, vl), vfadd_vf_f32m8(y, x, vl)) @@ -1159,7 +1159,7 @@ static int binary_op_2_3_4_20_fp16s(const Mat& a, const Mat& b, Mat& c, const Op int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(a0, _p, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1200,7 +1200,7 @@ static int binary_op_6_11_16_25_fp16s(const Mat& a, const Mat& b, Mat& c, const int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, b0, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1241,7 +1241,7 @@ static int binary_op_7_13_19_29_fp16s(const Mat& a, const Mat& b, Mat& c, const int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1309,7 +1309,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1344,7 +1344,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * h * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1381,7 +1381,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1420,7 +1420,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1459,7 +1459,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1492,7 +1492,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, *ptr1, vl); @@ -1527,7 +1527,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - 
word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1560,7 +1560,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _p = vfmv_v_f_f16m8(*ptr, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1598,7 +1598,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1637,7 +1637,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& const __fp16* ptr1_vol = ptr1 + x * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1_vol, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1676,7 +1676,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1715,7 +1715,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& const __fp16* ptr_vol = ptr + x * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr_vol, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1758,7 +1758,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1795,7 +1795,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1832,7 +1832,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * h1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1870,7 +1870,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1921,7 +1921,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1963,7 +1963,7 @@ static int 
binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1995,7 +1995,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -2027,7 +2027,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -2706,7 +2706,7 @@ static int binary_op_scalar_rvv_fp16s(Mat& a, float b, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = op(_p, b, vl); vse16_v_f16m8(ptr, _p, vl); @@ -2721,25 +2721,25 @@ static int binary_op_scalar_rvv_fp16s(Mat& a, float b, const Option& opt) namespace BinaryOp_riscv_functor { -#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ - struct NAME \ - { \ - __fp16 operator()(const __fp16& x, const __fp16& y) const \ - { \ - return IMPL; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const word_type vl) const \ - { \ - return IMPLVV; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const float y, const word_type vl) const \ - { \ - return IMPLVS; \ - } \ - vfloat16m8_t operator()(const float x, const vfloat16m8_t& y, const word_type vl) const \ - { \ - return IMPLSV; \ - } \ +#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + __fp16 operator()(const __fp16& x, const __fp16& y) const \ + { \ + return IMPL; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLVV; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const float y, const size_t vl) const \ + { \ + return IMPLVS; \ + } \ + vfloat16m8_t operator()(const float x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLSV; \ + } \ }; // clang-format off diff --git a/src/layer/riscv/cast_riscv.cpp b/src/layer/riscv/cast_riscv.cpp index 8ea5d0f05eff..5d0642e7da7d 100644 --- a/src/layer/riscv/cast_riscv.cpp +++ b/src/layer/riscv/cast_riscv.cpp @@ -101,7 +101,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat16m4_t _outp = vfncvt_f_f_w_f16m4(_p, vl); @@ -125,7 +125,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat32m8_t _outp = vfwcvt_f_f_v_f32m8(_p, vl); diff --git a/src/layer/riscv/clip_riscv.cpp b/src/layer/riscv/clip_riscv.cpp index 9acff0218f02..8c43e06a4d82 100644 --- a/src/layer/riscv/clip_riscv.cpp +++ b/src/layer/riscv/clip_riscv.cpp @@ -62,7 +62,7 @@ int Clip_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = 
size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmax_vf_f32m8(_p, min, vl); @@ -107,7 +107,7 @@ int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfmax_vf_f32m8(_p, min, vl); @@ -139,7 +139,7 @@ int Clip_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmax_vf_f16m8(_p, min, vl); diff --git a/src/layer/riscv/concat_riscv.cpp b/src/layer/riscv/concat_riscv.cpp index d80d9985b479..5736fd25dcd2 100644 --- a/src/layer/riscv/concat_riscv.cpp +++ b/src/layer/riscv/concat_riscv.cpp @@ -143,7 +143,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -266,7 +266,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int size = bottom_blob.w * bottom_blob.h; @@ -487,7 +487,7 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -610,7 +610,7 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int size = bottom_blob.w * bottom_blob.h; diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp index a956d394f175..483aa511672c 100644 --- a/src/layer/riscv/convolution1d_riscv.cpp +++ b/src/layer/riscv/convolution1d_riscv.cpp @@ -119,7 +119,7 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -476,7 +476,7 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt) int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -697,7 +697,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/convolution_1x1_packn.h b/src/layer/riscv/convolution_1x1_packn.h index 8f55d260abc9..31bf72ba3d08 100644 --- a/src/layer/riscv/convolution_1x1_packn.h +++ 
b/src/layer/riscv/convolution_1x1_packn.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, con static void conv1x1s2_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packn_fp16s.h b/src/layer/riscv/convolution_1x1_packn_fp16s.h index 110d61dc121e..5ac3f8967cea 100644 --- a/src/layer/riscv/convolution_1x1_packn_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packn_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_bl static void conv1x1s2_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packnto1.h b/src/layer/riscv/convolution_1x1_packnto1.h index 0cd1747586e9..a3e1204a3254 100644 --- a/src/layer/riscv/convolution_1x1_packnto1.h +++ b/src/layer/riscv/convolution_1x1_packnto1.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv1x1s2_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h index 04e86f97dca4..10591ab27f29 100644 --- a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top static void conv1x1s2_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_3x3_pack1ton.h b/src/layer/riscv/convolution_3x3_pack1ton.h index bb123ef8997e..9adcfb1e263c 100644 --- a/src/layer/riscv/convolution_3x3_pack1ton.h +++ b/src/layer/riscv/convolution_3x3_pack1ton.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h index e25c7d09097c..bff24a0099f9 
100644 --- a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_7x7_pack1ton.h b/src/layer/riscv/convolution_7x7_pack1ton.h index 06c4dfe2f6a2..3605ed027cdc 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton.h +++ b/src/layer/riscv/convolution_7x7_pack1ton.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h index 91ee1b7d8269..01804bf391da 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_pack1ton.h b/src/layer/riscv/convolution_pack1ton.h index f667f4d5d09f..15eec7badd98 100644 --- a/src/layer/riscv/convolution_pack1ton.h +++ b/src/layer/riscv/convolution_pack1ton.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_pack1ton_fp16s.h b/src/layer/riscv/convolution_pack1ton_fp16s.h index fc4861730316..6f8c649e632d 100644 --- a/src/layer/riscv/convolution_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -95,7 +95,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void 
convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packn.h b/src/layer/riscv/convolution_packn.h index c9b51d07881a..9d18c1d858e4 100644 --- a/src/layer/riscv/convolution_packn.h +++ b/src/layer/riscv/convolution_packn.h @@ -15,7 +15,7 @@ static void convolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packn_fp16s.h b/src/layer/riscv/convolution_packn_fp16s.h index 8ae4468495af..1f7b308e846e 100644 --- a/src/layer/riscv/convolution_packn_fp16s.h +++ b/src/layer/riscv/convolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -100,7 +100,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packnto1.h b/src/layer/riscv/convolution_packnto1.h index 7eda38580837..4c66116d20e8 100644 --- a/src/layer/riscv/convolution_packnto1.h +++ b/src/layer/riscv/convolution_packnto1.h @@ -15,7 +15,7 @@ static void convolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packnto1_fp16s.h b/src/layer/riscv/convolution_packnto1_fp16s.h index 63aefbb5d5a0..83efd3081f83 100644 --- a/src/layer/riscv/convolution_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, 
int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -109,7 +109,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm.h b/src/layer/riscv/convolution_sgemm.h index c62db6c78ee1..801b7cc456f2 100644 --- a/src/layer/riscv/convolution_sgemm.h +++ b/src/layer/riscv/convolution_sgemm.h @@ -16,7 +16,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_fp16s.h b/src/layer/riscv/convolution_sgemm_fp16s.h index 5cd5ea8a31e5..72a621641dbb 100644 --- a/src/layer/riscv/convolution_sgemm_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_fp16s.h @@ -16,7 +16,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_pack1ton.h b/src/layer/riscv/convolution_sgemm_pack1ton.h index bc2f558a6d9b..8a3e6ffbc437 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h index c3590a6ed6b1..0c0b2791a8f8 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_packn.h b/src/layer/riscv/convolution_sgemm_packn.h index 88518a23136a..9255c092ae40 100644 --- a/src/layer/riscv/convolution_sgemm_packn.h +++ b/src/layer/riscv/convolution_sgemm_packn.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const 
Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -78,7 +78,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -119,7 +119,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -156,7 +156,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons #else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -363,7 +363,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons static void convolution_im2col_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packn_fp16s.h b/src/layer/riscv/convolution_sgemm_packn_fp16s.h index 977dc38204a9..cb3b65196ed1 100644 --- a/src/layer/riscv/convolution_sgemm_packn_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packn_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -109,7 +109,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -172,7 +172,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, 
_val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -228,7 +228,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo #else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -435,7 +435,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo static void convolution_im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packnto1.h b/src/layer/riscv/convolution_sgemm_packnto1.h index 212cf98b39b9..2df2c7d76563 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1.h +++ b/src/layer/riscv/convolution_sgemm_packnto1.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -77,7 +77,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -118,7 +118,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -155,7 +155,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c #else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -190,6 +190,14 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c int nn_outch = outch / packn; int remain_outch_start = nn_outch * packn; +#ifdef __clang__ + // clang complains about VLA in the following loop + float* _zero_tmp = new float[packn](); + for (int _zero_clean_idx = 0; _zero_clean_idx < packn; _zero_clean_idx++) + { + _zero_tmp[_zero_clean_idx] = 0.f; + } +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_outch; pp++) { @@ -197,7 +205,11 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c 
float* outptr0 = top_blob.channel(p); +#ifdef __clang__ + const float* zeros = _zero_tmp; +#else const float zeros[packn] = {0.f}; +#endif // __clang__ const float* biasptr = bias ? bias + p : zeros; int i = 0; @@ -250,7 +262,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0 + 6, top_blob.cstep * sizeof(float), _sum6, vl); vsse32_v_f32m1(outptr0 + 7, top_blob.cstep * sizeof(float), _sum7, vl); #else - vssseg8e32_v_f32m1x8(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7), vl); + vssseg8e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); #endif outptr0 += 8; } @@ -287,7 +299,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); vsse32_v_f32m1(outptr0 + 3, top_blob.cstep * sizeof(float), _sum3, vl); #else - vssseg4e32_v_f32m1x4(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x4(_sum0, _sum1, _sum2, _sum3), vl); + vssseg4e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, vl); #endif outptr0 += 4; } @@ -316,7 +328,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); #else - vssseg2e32_v_f32m1x2(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x2(_sum0, _sum1), vl); + vssseg2e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, vl); #endif outptr0 += 2; } @@ -343,6 +355,9 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c outptr0 += 1; } } +#ifdef __clang__ + delete[] _zero_tmp; +#endif #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_outch_start; p < outch; p++) @@ -379,16 +394,24 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x8_t _val01 = vlseg8e32_v_f32m1x8(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vfloat32m1_t _val2; + vfloat32m1_t _val3; + vfloat32m1_t _val4; + vfloat32m1_t _val5; + vfloat32m1_t _val6; + vfloat32m1_t _val7; + vlseg8e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x8_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x8_f32m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, vget_f32m1x8_f32m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, vget_f32m1x8_f32m1(_val01, 3), _w0, vl); - _sum4 = vfmacc_vv_f32m1(_sum4, vget_f32m1x8_f32m1(_val01, 4), _w0, vl); - _sum5 = vfmacc_vv_f32m1(_sum5, vget_f32m1x8_f32m1(_val01, 5), _w0, vl); - _sum6 = vfmacc_vv_f32m1(_sum6, vget_f32m1x8_f32m1(_val01, 6), _w0, vl); - _sum7 = vfmacc_vv_f32m1(_sum7, vget_f32m1x8_f32m1(_val01, 7), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); + _sum4 = vfmacc_vv_f32m1(_sum4, _val4, _w0, vl); + _sum5 = vfmacc_vv_f32m1(_sum5, _val5, _w0, vl); + _sum6 = vfmacc_vv_f32m1(_sum6, _val6, _w0, vl); + _sum7 = vfmacc_vv_f32m1(_sum7, _val7, _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -463,12 +486,16 @@ static void 
im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x4_t _val01 = vlseg4e32_v_f32m1x4(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vfloat32m1_t _val2; + vfloat32m1_t _val3; + vlseg4e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x4_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x4_f32m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, vget_f32m1x4_f32m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, vget_f32m1x4_f32m1(_val01, 3), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -519,10 +546,12 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x2_t _val01 = vlseg2e32_v_f32m1x2(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vlseg2e32_v_f32m1(&_val0, &_val1, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x2_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x2_f32m1(_val01, 1), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -648,7 +677,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_rvv(const Mat& _k static void convolution_im2col_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h index d6dd867397c4..925713d9826b 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -77,7 +77,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -118,7 +118,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, 
_val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -155,7 +155,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ #else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -190,6 +190,14 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ int nn_outch = outch / packn; int remain_outch_start = nn_outch * packn; + // make clang happy with the following loop +#ifdef __clang__ + __fp16* _zero_tmp = new __fp16[packn](); + for (int _zero_clean_idx = 0; _zero_clean_idx < packn; _zero_clean_idx++) + { + _zero_tmp[_zero_clean_idx] = 0.f; + } +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_outch; pp++) { @@ -197,7 +205,11 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16* outptr0 = top_blob.channel(p); +#ifdef __clang__ + const __fp16* zeros = _zero_tmp; +#else const __fp16 zeros[packn] = {0.f}; +#endif // __clang__ const __fp16* biasptr = bias ? bias + p : zeros; int i = 0; @@ -250,7 +262,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0 + 6, top_blob.cstep * sizeof(__fp16), _sum6, vl); vsse16_v_f16m1(outptr0 + 7, top_blob.cstep * sizeof(__fp16), _sum7, vl); #else - vssseg8e16_v_f16m1x8(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7), vl); + vssseg8e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); #endif outptr0 += 8; } @@ -287,7 +299,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); vsse16_v_f16m1(outptr0 + 3, top_blob.cstep * sizeof(__fp16), _sum3, vl); #else - vssseg4e16_v_f16m1x4(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x4(_sum0, _sum1, _sum2, _sum3), vl); + vssseg4e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, vl); #endif outptr0 += 4; } @@ -316,7 +328,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); #else - vssseg2e16_v_f16m1x2(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x2(_sum0, _sum1), vl); + vssseg2e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, vl); #endif outptr0 += 2; } @@ -343,6 +355,9 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ outptr0 += 1; } } +#ifdef __clang__ + delete[] _zero_tmp; +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_outch_start; p < outch; p++) @@ -379,16 +394,24 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x8_t _val01 = vlseg8e16_v_f16m1x8(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vfloat16m1_t _val2; + vfloat16m1_t _val3; + vfloat16m1_t _val4; + vfloat16m1_t _val5; + vfloat16m1_t _val6; + vfloat16m1_t _val7; + vlseg8e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = 
vfmacc_vv_f16m1(_sum0, vget_f16m1x8_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x8_f16m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, vget_f16m1x8_f16m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, vget_f16m1x8_f16m1(_val01, 3), _w0, vl); - _sum4 = vfmacc_vv_f16m1(_sum4, vget_f16m1x8_f16m1(_val01, 4), _w0, vl); - _sum5 = vfmacc_vv_f16m1(_sum5, vget_f16m1x8_f16m1(_val01, 5), _w0, vl); - _sum6 = vfmacc_vv_f16m1(_sum6, vget_f16m1x8_f16m1(_val01, 6), _w0, vl); - _sum7 = vfmacc_vv_f16m1(_sum7, vget_f16m1x8_f16m1(_val01, 7), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); + _sum4 = vfmacc_vv_f16m1(_sum4, _val4, _w0, vl); + _sum5 = vfmacc_vv_f16m1(_sum5, _val5, _w0, vl); + _sum6 = vfmacc_vv_f16m1(_sum6, _val6, _w0, vl); + _sum7 = vfmacc_vv_f16m1(_sum7, _val7, _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -463,12 +486,17 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x4_t _val01 = vlseg4e16_v_f16m1x4(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vfloat16m1_t _val2; + vfloat16m1_t _val3; + + vlseg4e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x4_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x4_f16m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, vget_f16m1x4_f16m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, vget_f16m1x4_f16m1(_val01, 3), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -519,10 +547,12 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x2_t _val01 = vlseg2e16_v_f16m1x2(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vlseg2e16_v_f16m1(&_val0, &_val1, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x2_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x2_f16m1(_val01, 1), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -648,7 +678,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(const static void convolution_im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_winograd_dot.h b/src/layer/riscv/convolution_winograd_dot.h index 8ea6bc9c5761..c0a7b7680f81 100644 --- a/src/layer/riscv/convolution_winograd_dot.h +++ b/src/layer/riscv/convolution_winograd_dot.h @@ -16,7 +16,7 @@ static void convolution_winograd_dot_rvv(Mat& bottom_blob_tm, int outch, const M { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = 
vsetvl_e32m1(packn); #endif // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_winograd_dot_packn.h b/src/layer/riscv/convolution_winograd_dot_packn.h index 434eaa00c681..1c505d5c2e18 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn.h +++ b/src/layer/riscv/convolution_winograd_dot_packn.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u * packn, packn, opt.workspace_allocator); @@ -75,7 +75,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c vfloat32m1_t _val5 = vle32_v_f32m1(r0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(r0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(r0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; @@ -108,7 +108,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(r0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(r0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -137,7 +137,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c #else vfloat32m1_t _val0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; diff --git a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h index 0b731519426f..ed35ad3e3785 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u * packn, packn, opt.workspace_allocator); @@ -75,7 +75,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o vfloat16m1_t _val5 = vle16_v_f16m1(r0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(r0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(r0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; @@ -108,7 +108,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(r0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(r0 + packn * 3, vl); - 
vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -137,7 +137,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o #else vfloat16m1_t _val0 = vle16_v_f16m1(r0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; diff --git a/src/layer/riscv/convolution_winograd_transform_packn.h b/src/layer/riscv/convolution_winograd_transform_packn.h index db3a05aa92f4..f5a529707598 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn.h +++ b/src/layer/riscv/convolution_winograd_transform_packn.h @@ -15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -436,7 +436,7 @@ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -553,7 +553,7 @@ static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -646,7 +646,7 @@ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd23_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; diff --git a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h index b1b1ad9f54d8..2404a8a40928 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h @@ 
-15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -436,7 +436,7 @@ static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -553,7 +553,7 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -646,7 +646,7 @@ static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd23_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn.h b/src/layer/riscv/convolutiondepthwise_3x3_packn.h index d8aa0ec4ee03..0cab1af0802b 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_packn.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn.h @@ -15,7 +15,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M static void convdw3x3s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h index c3d73053beaf..d479385f6a2c 100644 --- 
a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw3x3s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn.h b/src/layer/riscv/convolutiondepthwise_5x5_packn.h index cd35ef8e816c..2ef2fea74551 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -336,7 +336,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M static void convdw5x5s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h index 1647f96db8cc..08270e307c9f 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -336,7 +336,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw5x5s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index e33360e06092..eb39ac0baa7b 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -282,7 +282,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -710,7 +710,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); 
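
Most hunks in this patch are the same mechanical substitution: the value returned by vsetvl_* is now held in plain size_t, which is the return type in the current RVV intrinsic API, instead of the retired word_type alias. For reference, the strip-mining pattern these loops all share looks like the following minimal sketch (modeled on the dropout hunk further below; the helper name is illustrative and not part of the patch):

    #include <riscv_vector.h>

    // Scale a float buffer in place, one vector-length chunk per iteration.
    // vsetvl_e32m8() now returns size_t (formerly typedef'd as word_type).
    static void scale_inplace(float* ptr, int n, float scale)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e32m8(n);              // elements handled this round
            vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); // unit-stride load
            _p = vfmul_vf_f32m8(_p, scale, vl);
            vse32_v_f32m8(ptr, _p, vl);               // unit-stride store
            ptr += vl;
            n -= vl;
        }
    }
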
int w = bottom_blob.w; int h = bottom_blob.h; @@ -920,7 +920,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/crop_riscv.cpp b/src/layer/riscv/crop_riscv.cpp index f7b44efd1a12..80e76fc47b47 100644 --- a/src/layer/riscv/crop_riscv.cpp +++ b/src/layer/riscv/crop_riscv.cpp @@ -43,7 +43,7 @@ static void crop_packn_rvv(const Mat& src, Mat& dst, int top, int left, int pack int h = dst.h; int right = src.w - dst.w - left; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float* ptr = src.row(top) + left * packn; float* outptr = dst; @@ -69,7 +69,7 @@ static void crop_packn_bf16_fp16s_rvv(const Mat& src, Mat& dst, int top, int lef int h = dst.h; int right = src.w - dst.w - left; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const unsigned short* ptr = src.row(top) + left * packn; unsigned short* outptr = dst; diff --git a/src/layer/riscv/deconvolution_pack1ton.h b/src/layer/riscv/deconvolution_pack1ton.h index dfbe8e01a2d5..ec18f62c1c6f 100644 --- a/src/layer/riscv/deconvolution_pack1ton.h +++ b/src/layer/riscv/deconvolution_pack1ton.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_pack1ton_fp16s.h b/src/layer/riscv/deconvolution_pack1ton_fp16s.h index a1fcfefc254f..168c709217d2 100644 --- a/src/layer/riscv/deconvolution_pack1ton_fp16s.h +++ b/src/layer/riscv/deconvolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -103,7 +103,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packn.h b/src/layer/riscv/deconvolution_packn.h index 457e2b95c929..8cab6c3b0a10 100644 --- a/src/layer/riscv/deconvolution_packn.h +++ b/src/layer/riscv/deconvolution_packn.h @@ -15,7 +15,7 @@ static void deconvolution_packn_rvv(const Mat& bottom_blob, Mat& 
top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packn_fp16s.h b/src/layer/riscv/deconvolution_packn_fp16s.h index 46d52470ad04..62fbd2eb731c 100644 --- a/src/layer/riscv/deconvolution_packn_fp16s.h +++ b/src/layer/riscv/deconvolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -105,7 +105,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, static void deconvolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packnto1.h b/src/layer/riscv/deconvolution_packnto1.h index ba81baf3676c..2efa9b154d2e 100644 --- a/src/layer/riscv/deconvolution_packnto1.h +++ b/src/layer/riscv/deconvolution_packnto1.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packnto1_fp16s.h b/src/layer/riscv/deconvolution_packnto1_fp16s.h index 5cb0a3c49bd1..ab70100fb3bd 100644 --- a/src/layer/riscv/deconvolution_packnto1_fp16s.h +++ b/src/layer/riscv/deconvolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -116,7 +116,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 
2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index ab20e6c4148a..b53e8962fd26 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -210,7 +210,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // convolv with NxN kernel @@ -518,7 +518,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -739,7 +739,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/dropout_riscv.cpp b/src/layer/riscv/dropout_riscv.cpp index fc71db7689aa..461edf2d056d 100644 --- a/src/layer/riscv/dropout_riscv.cpp +++ b/src/layer/riscv/dropout_riscv.cpp @@ -53,7 +53,7 @@ int Dropout_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vf_f32m8(_p, scale, vl); diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index 325ab6f175d1..491c051c7fea 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -119,7 +119,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); vsse32_v_f32m1(outptr, w * sizeof(float), _p, vl); @@ -147,7 +147,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); vsse32_v_f32m1(outptr, size * sizeof(float), _p, vl); @@ -172,7 +172,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vse32_v_f32m8(outptr, _p, vl); @@ -262,7 +262,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p = vle16_v_u16m1(ptr, vl); vsse16_v_u16m1(outptr, w * sizeof(unsigned short), _p, vl); @@ -290,7 +290,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p = vle16_v_u16m1(ptr, vl); 
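
The Flatten hunks in this file pair a unit-stride load with a strided store (vsse32/vsse16) so that each lane of a packed element group lands in a different output row. Isolated from the layer code, the stride semantics are simply that lane j is written to base + j * byte_stride. A hypothetical stand-alone example (not from the patch) that writes one column of a row-major matrix:

    // mat is h rows of w floats; copy src[0..h) into column `col`.
    static void store_column(float* mat, int w, int h, int col, const float* src)
    {
        float* dst = mat + col;
        int n = h;
        while (n > 0)
        {
            size_t vl = vsetvl_e32m1(n);
            vfloat32m1_t _v = vle32_v_f32m1(src, vl);
            vsse32_v_f32m1(dst, w * sizeof(float), _v, vl); // byte stride = one row
            src += vl;
            dst += vl * w;
            n -= vl;
        }
    }
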
vsse16_v_u16m1(outptr, size * sizeof(unsigned short), _p, vl); @@ -315,7 +315,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vuint16m8_t _p = vle16_v_u16m8(ptr, vl); vse16_v_u16m8(outptr, _p, vl); @@ -405,7 +405,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e8m1(n); + size_t vl = vsetvl_e8m1(n); vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); @@ -433,7 +433,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e8m1(n); + size_t vl = vsetvl_e8m1(n); vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); @@ -458,7 +458,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e8m8(n); + size_t vl = vsetvl_e8m8(n); vint8m8_t _p = vle8_v_i8m8(ptr, vl); vse8_v_i8m8(outptr, _p, vl); diff --git a/src/layer/riscv/gelu_riscv.cpp b/src/layer/riscv/gelu_riscv.cpp index 708e951e5a31..69b374998f32 100644 --- a/src/layer/riscv/gelu_riscv.cpp +++ b/src/layer/riscv/gelu_riscv.cpp @@ -48,7 +48,7 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vfloat32m4_t _p = vle32_v_f32m4(ptr, vl); @@ -77,7 +77,7 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); auto _p = vle32_v_f32m8(ptr, vl); auto _perfc = vfmul_vf_f32m8(_p, -.70710678f, vl); _p = vfmul_vf_f32m8(_p, .5f, vl); diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp index e45d37592ef4..28afa5081d06 100644 --- a/src/layer/riscv/gru_riscv.cpp +++ b/src/layer/riscv/gru_riscv.cpp @@ -63,7 +63,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _x = vle32_v_f32m8(ptr_x, vl); vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); @@ -93,7 +93,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e32m8(n_out); + size_t vl = vsetvl_e32m8(n_out); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); @@ -136,7 +136,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e32m8(n_out2); + size_t vl = vsetvl_e32m8(n_out2); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); @@ -160,7 +160,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _x = vle32_v_f32m8(ptr_x2, vl); vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); @@ -428,7 +428,7 @@ static int 
gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x, vl), vl); vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); @@ -458,7 +458,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e16m4(n_out); + size_t vl = vsetvl_e16m4(n_out); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); @@ -501,7 +501,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e16m4(n_out2); + size_t vl = vsetvl_e16m4(n_out2); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); @@ -525,7 +525,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e16m4(n2); + size_t vl = vsetvl_e16m4(n2); vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x2, vl), vl); vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); @@ -758,7 +758,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _x = vle16_v_f16m8(ptr_x, vl); vfloat16m8_t _xcr = vle16_v_f16m8(ptr_xcr, vl); vfloat16m8_t _xcu = vle16_v_f16m8(ptr_xcu, vl); @@ -785,7 +785,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e16m4(n_out); + size_t vl = vsetvl_e16m4(n_out); vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc, vl), vl); vfloat16m4_t _hcr = vle16_v_f16m4(ptr_hcr, vl); vfloat16m4_t _hcu = vle16_v_f16m4(ptr_hcu, vl); @@ -825,7 +825,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e16m4(n_out2); + size_t vl = vsetvl_e16m4(n_out2); vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc2, vl), vl); vfloat16m4_t _whc_n = vle16_v_f16m4(ptr_whc_n, vl); @@ -846,7 +846,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e16m8(n2); + size_t vl = vsetvl_e16m8(n2); vfloat16m8_t _x = vle16_v_f16m8(ptr_x2, vl); vfloat16m8_t _xcn = vle16_v_f16m8(ptr_xcn, vl); diff --git a/src/layer/riscv/hardsigmoid_riscv.cpp b/src/layer/riscv/hardsigmoid_riscv.cpp index 2c3bbec28862..112a1c9c8d29 100644 --- a/src/layer/riscv/hardsigmoid_riscv.cpp +++ b/src/layer/riscv/hardsigmoid_riscv.cpp @@ -60,7 +60,7 @@ int HardSigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); @@ -111,7 +111,7 @@ int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p 
= vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); diff --git a/src/layer/riscv/hardswish_riscv.cpp b/src/layer/riscv/hardswish_riscv.cpp index b60197115ca5..5d68e07b06a5 100644 --- a/src/layer/riscv/hardswish_riscv.cpp +++ b/src/layer/riscv/hardswish_riscv.cpp @@ -60,7 +60,7 @@ int HardSwish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); @@ -111,7 +111,7 @@ int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 721c6361b8be..30dd74287776 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -198,7 +198,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #if __riscv_vector if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -237,7 +237,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -273,7 +273,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -372,7 +372,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); if (bias_term) @@ -414,7 +414,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt { int p = pp * packn; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); if (bias_term) @@ -595,7 +595,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con { if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -635,7 +635,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -672,7 +672,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -765,7 +765,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con #pragma omp parallel for num_threads(opt.num_threads) 
for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); if (bias_term) @@ -857,7 +857,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co { if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -897,7 +897,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -934,7 +934,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -1027,7 +1027,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); if (bias_term) diff --git a/src/layer/riscv/instancenorm_riscv.cpp b/src/layer/riscv/instancenorm_riscv.cpp new file mode 100644 index 000000000000..c13cf261220f --- /dev/null +++ b/src/layer/riscv/instancenorm_riscv.cpp @@ -0,0 +1,514 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
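
This new file (together with its header further below) adds a RISC-V InstanceNorm implementation. Stripped of the vector and fp16 variants, the per-channel math is the usual x = (x - mean) / sqrt(var + eps) * gamma + beta, with the variance taken over mean-centered values in a second pass rather than as E[x^2] - E[x]^2, which would lose precision. A condensed scalar sketch of what each path below computes (helper name illustrative only):

    #include <math.h>

    static void instance_norm_channel(float* ptr, int size, float gamma, float beta, float eps)
    {
        float sum = 0.f;
        for (int i = 0; i < size; i++)
            sum += ptr[i];
        const float mean = sum / size;

        float sqsum = 0.f;
        for (int i = 0; i < size; i++)
        {
            const float d = ptr[i] - mean;
            sqsum += d * d;                  // centered second pass
        }
        const float var = sqsum / size;

        const float a = gamma / sqrtf(var + eps);
        const float b = -mean * a + beta;
        for (int i = 0; i < size; i++)
            ptr[i] = ptr[i] * a + b;         // fused scale and shift
    }
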
+ +#include "instancenorm_riscv.h" + +#include + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" + +namespace ncnn { +InstanceNorm_riscv::InstanceNorm_riscv() +{ +#if __riscv_vector + support_packing = true; +#if __riscv_zfh + support_fp16_storage = true; +#endif +#endif // __riscv_vector +} + +int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ +// x = (x - mean) / (sqrt(var + eps)) * gamma + beta +#if __riscv_vector + int elembits = bottom_top_blob.elembits(); + if (opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_inplace_fp16sa(bottom_top_blob, opt); + else + return forward_inplace_fp16s(bottom_top_blob, opt); + } + int elempack = bottom_top_blob.elempack; +#endif // __riscv_vector + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int c = bottom_top_blob.c; + int size = w * h; + + int dims = bottom_top_blob.dims; +#if __riscv_vector + if (elempack == 1) +#endif // __riscv_vector + { +#if __riscv_vector + size = elempack * size; +#endif + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + + // mean and var + float sum = 0.f; + float sqsum = 0.f; +#if __riscv_vector + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + { + int n = size; + float* ptr_sum = ptr; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); + _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f32m1_f32(_sum); +#else + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + //sqsum += ptr[i] * ptr[i]; + } +#endif // __riscv_vector + float mean = sum / size; +#if __riscv_vecotr + { + int n = size; + float* ptr_sqsum = ptr; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f32m1_f32(_sqsum); +#else + float tmp = 0.f; + for (int i = 0; i < size; i++) + { + tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } +#endif // __riscv_vector + float var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + float a; + float b; + if (affine) + { + float gamma = gamma_data[q]; + float beta = beta_data[q]; + + a = static_cast(gamma / (sqrt(var + eps))); + b = -mean * a + beta; + } + else + { + a = static_cast(1.f / (sqrt(var + eps))); + b = -mean * a; + } +#if __riscv_vector + { + int n = size; + float* ptr_store = ptr; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse32_v_f32m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } +#endif // __riscv_vector + } + return 0; + } + +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + if (elempack == packn) + { + const size_t vl = vsetvl_e32m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = 
+            vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
+            vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vl);
+
+            for (int i = 0; i < size; i++)
+            {
+                vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
+                _sum = vfadd_vv_f32m1(_p, _sum, vl);
+                // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl);
+            }
+            vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, size, vl);
+            for (int i = 0; i < size; i++)
+            {
+                vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
+                _p = vfsub_vv_f32m1(_p, _mean, vl);
+                _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl);
+            }
+            vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, size, vl);
+            // the var maybe minus due to accuracy
+            //float var = sqsum / size - mean * mean;
+
+            vfloat32m1_t _a;
+            vfloat32m1_t _b;
+            if (affine)
+            {
+                vfloat32m1_t _gamma = vle32_v_f32m1((const float*)gamma_data + q * vl, vl);
+                vfloat32m1_t _beta = vle32_v_f32m1((const float*)beta_data + q * vl, vl);
+                _a = vfdiv_vv_f32m1(_gamma, vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), vl);
+                _b = vfnmsub_vv_f32m1(_a, _mean, _beta, vl);
+            }
+            else
+            {
+                _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl);
+                _b = vfmul_vv_f32m1(_a, _mean, vl);
+                _b = vfsgnjn_vv_f32m1(_b, _b, vl);
+            }
+            for (int i = 0; i < size; i++)
+            {
+                vfloat32m1_t _p = vle32_v_f32m1(ptr + i * vl, vl);
+                _p = vfmadd_vv_f32m1(_p, _a, _b, vl);
+                vse32_v_f32m1(ptr + i * vl, _p, vl);
+            }
+        }
+        return 0;
+    }
+#endif // __riscv_vector
+    return 0;
+}
+
+#if __riscv_vector && __riscv_zfh
+int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    // x = (x - mean) / (sqrt(var + eps)) * gamma + beta
+
+    int elempack = bottom_top_blob.elempack;
+
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int c = bottom_top_blob.c;
+    int size = w * h;
+
+    int dims = bottom_top_blob.dims;
+    if (elempack == 1)
+    {
+        size = elempack * size;
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < c; q++)
+        {
+            __fp16* ptr = bottom_top_blob.channel(q);
+
+            // mean and var
+            float sum = 0.f;
+            float sqsum = 0.f;
+            vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+            vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+            {
+                int n = size;
+                __fp16* ptr_sum = ptr;
+                while (n > 0)
+                {
+                    size_t vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl);
+                    _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+                    // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                    ptr_sum += vl;
+                    n -= vl;
+                }
+            }
+            sum = vfmv_f_s_f32m1_f32(_sum);
+            float mean = sum / size;
+            {
+                int n = size;
+                __fp16* ptr_sqsum = ptr;
+                while (n > 0)
+                {
+                    size_t vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl);
+                    _p = vfsub_vf_f32m8(_p, mean, vl);
+                    _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                    n -= vl;
+                    ptr_sqsum += vl;
+                }
+            }
+            sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+            float var = sqsum / size;
+            // the var maybe minus due to accuracy
+            //float var = sqsum / size - mean * mean;
+
+            float a;
+            float b;
+            if (affine)
+            {
+                float gamma = gamma_data[q];
+                float beta = beta_data[q];
+
+                a = static_cast<float>(gamma / (sqrt(var + eps)));
+                b = -mean * a + beta;
+            }
+            else
+            {
+                a = static_cast<float>(1.f / (sqrt(var + eps)));
+                b = -mean * a;
+            }
+            {
+                int n = size;
+                __fp16* ptr_store = ptr;
+                while (n > 0)
+                {
+                    size_t vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p =
vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl); + n -= vl; + ptr_store += vl; + } + } + } + return 0; + } + + const int packn = csrr_vlenb() / 2; + if (elempack == packn) + { + const size_t vl = vsetvl_e16m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sqsum = vfmv_v_f_f32m2(0.f, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); + _sum = vfadd_vv_f32m2(_p, _sum, vl); + // _sqsum = vfmadd_vv_f32m2(_p,_p,_sqsum,vl); + } + vfloat32m2_t _mean = vfdiv_vf_f32m2(_sum, size, vl); + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); + _p = vfsub_vv_f32m2(_p, _mean, vl); + _sqsum = vfmadd_vv_f32m2(_p, _p, _sqsum, vl); + } + vfloat32m2_t _var = vfdiv_vf_f32m2(_sqsum, size, vl); + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + vfloat32m2_t _a; + vfloat32m2_t _b; + if (affine) + { + vfloat32m2_t _gamma = vle32_v_f32m2((const float*)gamma_data + q * vl, vl); + vfloat32m2_t _beta = vle32_v_f32m2((const float*)beta_data + q * vl, vl); + _a = vfdiv_vv_f32m2(_gamma, vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), vl); + _b = vfnmsub_vv_f32m2(_a, _mean, _beta, vl); + } + else + { + _a = vfrdiv_vf_f32m2(vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl); + _b = vfmul_vv_f32m2(_a, _mean, vl); + _b = vfsgnjn_vv_f32m2(_b, _b, vl); + } + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + i * vl, vl), vl); + _p = vfmadd_vv_f32m2(_p, _a, _b, vl); + vse16_v_f16m1(ptr + i * vl, vfncvt_f_f_w_f16m1(_p, vl), vl); + } + } + return 0; + } + return 0; +} + +int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + // x = (x - mean) / (sqrt(var + eps)) * gamma + beta + int elempack = bottom_top_blob.elempack; + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int c = bottom_top_blob.c; + int size = w * h; + + int dims = bottom_top_blob.dims; + if (elempack == 1) + { + size = elempack * size; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + + // mean and var + __fp16 sum = 0.f; + __fp16 sqsum = 0.f; + vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); + vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); + { + int n = size; + __fp16* ptr_sum = ptr; + while (n > 0) + { + size_t vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl); + _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f16m1_f16(_sum); + __fp16 mean = sum / size; + { + int n = size; + __fp16* ptr_sqsum = ptr; + while (n > 0) + { + size_t vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl); + _p = vfsub_vf_f16m8(_p, mean, vl); + _sqsum = vfredosum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f16m1_f16(_sqsum); + __fp16 var = sqsum / size; + // the var maybe minus due to 
accuracy + //float var = sqsum / size - mean * mean; + + __fp16 a; + __fp16 b; + if (affine) + { + float gamma = gamma_data[q]; + float beta = beta_data[q]; + + a = static_cast<__fp16>(gamma / (sqrt(var + eps))); + b = static_cast<__fp16>(-mean * a + beta); + } + else + { + a = static_cast<__fp16>(1.f / (sqrt(var + eps))); + b = static_cast<__fp16>(-mean * a); + } + { + int n = size; + __fp16* ptr_store = ptr; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl); + _p = vfmul_vf_f16m8(_p, a, vl); + _p = vfadd_vf_f16m8(_p, b, vl); + vse16_v_f16m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } + } + return 0; + } + + const int packn = csrr_vlenb() / 2; + if (elempack == packn) + { + const size_t vl = vsetvl_e16m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sqsum = vfmv_v_f_f16m1(0.f, vl); + + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); + _sum = vfadd_vv_f16m1(_p, _sum, vl); + // _sqsum = vfmadd_vv_f16m1(_p,_p,_sqsum,vl); + } + vfloat16m1_t _mean = vfdiv_vf_f16m1(_sum, size, vl); + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); + _p = vfsub_vv_f16m1(_p, _mean, vl); + _sqsum = vfmadd_vv_f16m1(_p, _p, _sqsum, vl); + } + vfloat16m1_t _var = vfdiv_vf_f16m1(_sqsum, size, vl); + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + vfloat16m1_t _a; + vfloat16m1_t _b; + if (affine) + { + vfloat16m1_t _gamma = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)gamma_data + q * vl, vl), vl); + vfloat16m1_t _beta = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)beta_data + q * vl, vl), vl); + _a = vfdiv_vv_f16m1(_gamma, vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), vl); + _b = vfnmsub_vv_f16m1(_a, _mean, _beta, vl); + } + else + { + _a = vfrdiv_vf_f16m1(vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl); + _b = vfmul_vv_f16m1(_a, _mean, vl); + _b = vfsgnjn_vv_f16m1(_b, _b, vl); + } + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + i * vl, vl); + _p = vfmadd_vv_f16m1(_p, _a, _b, vl); + vse16_v_f16m1(ptr + i * vl, _p, vl); + } + } + return 0; + } + return 0; +} + +#endif // __riscv_vector && __riscv_zfh + +} // namespace ncnn \ No newline at end of file diff --git a/src/layer/riscv/instancenorm_riscv.h b/src/layer/riscv/instancenorm_riscv.h new file mode 100644 index 000000000000..80583cc2c89f --- /dev/null +++ b/src/layer/riscv/instancenorm_riscv.h @@ -0,0 +1,36 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
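
Besides the word_type change, the interpolation and packing hunks further below also migrate off the tuple-typed segment intrinsics: a two-field segment load now writes directly into two vector variables passed by pointer, instead of returning a vfloat32m4x2_t that had to be unpacked with vget_f32m4x2_f32m4. A self-contained illustration of the new form (hypothetical helper, using the same intrinsics as the patch):

    // Multiply the two fields of interleaved (a0, a1) pairs: out[i] = pairs[2*i] * pairs[2*i+1].
    static void pair_products(const float* pairs, float* out, int n)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e32m4(n);
            vfloat32m4_t _a0;
            vfloat32m4_t _a1;
            // field 0 of every pair goes to _a0, field 1 to _a1
            vlseg2e32_v_f32m4(&_a0, &_a1, pairs, vl);
            vse32_v_f32m4(out, vfmul_vv_f32m4(_a0, _a1, vl), vl);
            pairs += vl * 2;
            out += vl;
            n -= vl;
        }
    }
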
+ +#ifndef LAYER_INSTANCENORM_RISCV_H +#define LAYER_INSTANCENORM_RISCV_H + +#include "instancenorm.h" + +namespace ncnn { +class InstanceNorm_riscv : virtual public InstanceNorm +{ +public: + InstanceNorm_riscv(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if __riscv_vector && __riscv_zfh + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; + int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; +} // namespace ncnn + +#endif // LAYER_INSTANCENORM_RISCV_H \ No newline at end of file diff --git a/src/layer/riscv/interp_bicubic_packn.h b/src/layer/riscv/interp_bicubic_packn.h index 16ed365ff536..4c4eb869c43d 100644 --- a/src/layer/riscv/interp_bicubic_packn.h +++ b/src/layer/riscv/interp_bicubic_packn.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bicubic_packn_fp16s.h b/src/layer/riscv/interp_bicubic_packn_fp16s.h index b83a9eba3c63..ff2284552b7f 100644 --- a/src/layer/riscv/interp_bicubic_packn_fp16s.h +++ b/src/layer/riscv/interp_bicubic_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -244,7 +244,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bilinear.h b/src/layer/riscv/interp_bilinear.h index 1742626017ac..0f6338d73109 100644 --- a/src/layer/riscv/interp_bilinear.h +++ b/src/layer/riscv/interp_bilinear.h @@ -86,16 +86,17 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4x2_t _S1p = vloxseg2ei32_v_f32m4x2(S1, _sx, vl); - vfloat32m4_t _S1p0 = vget_f32m4x2_f32m4(_S1p, 0); - vfloat32m4_t _S1p1 = vget_f32m4x2_f32m4(_S1p, 1); - vfloat32m4x2_t _a0a1 = vlseg2e32_v_f32m4x2(alphap, vl); - vfloat32m4_t _a0 = vget_f32m4x2_f32m4(_a0a1, 0); - vfloat32m4_t _a1 = vget_f32m4x2_f32m4(_a0a1, 1); + vfloat32m4_t _S1p0; + vfloat32m4_t _S1p1; + vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, S1, _sx, vl); + + vfloat32m4_t _a0; + vfloat32m4_t _a1; + vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); @@ -135,19 +136,21 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4x2_t _S0p = vloxseg2ei32_v_f32m4x2(S0, _sx, vl); - vfloat32m4x2_t _S1p = vloxseg2ei32_v_f32m4x2(S1, _sx, vl); - vfloat32m4_t _S0p0 = vget_f32m4x2_f32m4(_S0p, 0); - vfloat32m4_t 
_S0p1 = vget_f32m4x2_f32m4(_S0p, 1); - vfloat32m4_t _S1p0 = vget_f32m4x2_f32m4(_S1p, 0); - vfloat32m4_t _S1p1 = vget_f32m4x2_f32m4(_S1p, 1); - vfloat32m4x2_t _a0a1 = vlseg2e32_v_f32m4x2(alphap, vl); - vfloat32m4_t _a0 = vget_f32m4x2_f32m4(_a0a1, 0); - vfloat32m4_t _a1 = vget_f32m4x2_f32m4(_a0a1, 1); + vfloat32m4_t _S0p0; + vfloat32m4_t _S0p1; + vfloat32m4_t _S1p0; + vfloat32m4_t _S1p1; + + vloxseg2ei32_v_f32m4(&_S0p0, &_S0p1, S0, _sx, vl); + vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, S1, _sx, vl); + + vfloat32m4_t _a0; + vfloat32m4_t _a1; + vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); vfloat32m4_t _rows0 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S0p0, _a0, vl), _S0p1, _a1, vl); vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); @@ -192,7 +195,7 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); diff --git a/src/layer/riscv/interp_bilinear_fp16s.h b/src/layer/riscv/interp_bilinear_fp16s.h index 091e86b7301b..cd61af6efac3 100644 --- a/src/layer/riscv/interp_bilinear_fp16s.h +++ b/src/layer/riscv/interp_bilinear_fp16s.h @@ -131,7 +131,7 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, int n = w; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); @@ -232,7 +232,7 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha int n = w; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _rows0 = vle16_v_f16m8(rows0p, vl); vfloat16m8_t _rows1 = vle16_v_f16m8(rows1p, vl); diff --git a/src/layer/riscv/interp_bilinear_packn.h b/src/layer/riscv/interp_bilinear_packn.h index 0d800e324cba..9dffc01bf300 100644 --- a/src/layer/riscv/interp_bilinear_packn.h +++ b/src/layer/riscv/interp_bilinear_packn.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bilinear_packn_fp16s.h b/src/layer/riscv/interp_bilinear_packn_fp16s.h index b48fd8431a4a..dfe02c00d1be 100644 --- a/src/layer/riscv/interp_bilinear_packn_fp16s.h +++ b/src/layer/riscv/interp_bilinear_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -122,7 +122,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a static void resize_bilinear_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp index b72cfd00280f..ea8344985edf 100644 --- a/src/layer/riscv/interp_riscv.cpp +++ b/src/layer/riscv/interp_riscv.cpp @@ -88,7 +88,7 @@ int Interp_riscv::forward(const std::vector& 
bottom_blobs, std::vector #if __riscv_vector if (elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -130,7 +130,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -153,7 +153,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -190,7 +190,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -328,7 +328,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float hs = output_height ? h / (float)outh : 1.f / height_scale; const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -518,7 +518,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -558,7 +558,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -581,7 +581,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -618,7 +618,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -754,7 +754,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const float hs = output_height ? h / (float)outh : 1.f / height_scale; const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; @@ -955,7 +955,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect { if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -992,7 +992,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; diff --git a/src/layer/riscv/mish_riscv.cpp b/src/layer/riscv/mish_riscv.cpp index abee1ec37489..4ddb14700061 100644 --- a/src/layer/riscv/mish_riscv.cpp +++ b/src/layer/riscv/mish_riscv.cpp @@ -64,7 +64,7 @@ int Mish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); @@ -103,7 +103,7 @@ int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); @@ -134,7 +134,7 @@ int Mish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmul_vv_f16m8(_p, tanh_ps(log_ps(vfadd_vf_f16m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index 1805c2469eb7..5c298da522dd 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -18,6 +18,8 @@ #include #endif // __riscv_vector +#include "riscv_usability.h" + namespace ncnn { Packing_riscv::Packing_riscv() @@ -137,13 +139,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -181,13 +183,18 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); + + vfloat32m2_t _p0; + vfloat32m2_t _p1; + vfloat32m2_t _p2; + vfloat32m2_t _p3; + vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); - vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl); - vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl); - vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl); - vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl); - vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl); + vse32_v_f32m2(outptr0, _p0, vl); + vse32_v_f32m2(outptr1, _p1, vl); + vse32_v_f32m2(outptr2, _p2, vl); + vse32_v_f32m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -229,7 +236,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl 
= vsetvl_e32m1(n); vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); @@ -239,7 +246,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -289,17 +296,25 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vse32_v_f32m1(outptr0, vget_f32m1x8_f32m1(_p, 0), vl); - vse32_v_f32m1(outptr1, vget_f32m1x8_f32m1(_p, 1), vl); - vse32_v_f32m1(outptr2, vget_f32m1x8_f32m1(_p, 2), vl); - vse32_v_f32m1(outptr3, vget_f32m1x8_f32m1(_p, 3), vl); - vse32_v_f32m1(outptr4, vget_f32m1x8_f32m1(_p, 4), vl); - vse32_v_f32m1(outptr5, vget_f32m1x8_f32m1(_p, 5), vl); - vse32_v_f32m1(outptr6, vget_f32m1x8_f32m1(_p, 6), vl); - vse32_v_f32m1(outptr7, vget_f32m1x8_f32m1(_p, 7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vse32_v_f32m1(outptr0, _p0, vl); + vse32_v_f32m1(outptr1, _p1, vl); + vse32_v_f32m1(outptr2, _p2, vl); + vse32_v_f32m1(outptr3, _p3, vl); + vse32_v_f32m1(outptr4, _p4, vl); + vse32_v_f32m1(outptr5, _p5, vl); + vse32_v_f32m1(outptr6, _p6, vl); + vse32_v_f32m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -343,19 +358,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x4_t _p0 = vlseg4e32_v_f32m1x4(r0, vl); - vfloat32m1x4_t _p1 = vlseg4e32_v_f32m1x4(r1, vl); - vfloat32m1_t _p00 = vget_f32m1x4_f32m1(_p0, 0); - vfloat32m1_t _p01 = vget_f32m1x4_f32m1(_p0, 1); - vfloat32m1_t _p02 = vget_f32m1x4_f32m1(_p0, 2); - vfloat32m1_t _p03 = vget_f32m1x4_f32m1(_p0, 3); - vfloat32m1_t _p10 = vget_f32m1x4_f32m1(_p1, 0); - vfloat32m1_t _p11 = vget_f32m1x4_f32m1(_p1, 1); - vfloat32m1_t _p12 = vget_f32m1x4_f32m1(_p1, 2); - vfloat32m1_t _p13 = vget_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p00; + vfloat32m1_t _p01; + vfloat32m1_t _p02; + vfloat32m1_t _p03; + vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + + vfloat32m1_t _p10; + vfloat32m1_t _p11; + vfloat32m1_t _p12; + vfloat32m1_t _p13; + vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -395,19 +412,19 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vfloat32m1_t _p0 = vget_f32m1x8_f32m1(_p, 0); - vfloat32m1_t _p1 = vget_f32m1x8_f32m1(_p, 1); - vfloat32m1_t _p2 = vget_f32m1x8_f32m1(_p, 2); - vfloat32m1_t _p3 = vget_f32m1x8_f32m1(_p, 3); - vfloat32m1_t _p4 = vget_f32m1x8_f32m1(_p, 4); - vfloat32m1_t _p5 = vget_f32m1x8_f32m1(_p, 5); - vfloat32m1_t _p6 = vget_f32m1x8_f32m1(_p, 6); - vfloat32m1_t _p7 = 
vget_f32m1x8_f32m1(_p, 7); - vsseg4e32_v_f32m1x4(outptr0, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e32_v_f32m1x4(outptr1, vcreate_f32m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -466,13 +483,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -510,13 +527,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m2(n); - - vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl); - vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl); - vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl); - vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl); - vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl); + size_t vl = vsetvl_e32m2(n); + vfloat32m2_t _p0; + vfloat32m2_t _p1; + vfloat32m2_t _p2; + vfloat32m2_t _p3; + vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse32_v_f32m2(outptr0, _p0, vl); + vse32_v_f32m2(outptr1, _p1, vl); + vse32_v_f32m2(outptr2, _p2, vl); + vse32_v_f32m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -558,7 +578,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); @@ -568,7 +588,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -618,17 +638,26 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vse32_v_f32m1(outptr0, vget_f32m1x8_f32m1(_p, 0), vl); - vse32_v_f32m1(outptr1, vget_f32m1x8_f32m1(_p, 1), vl); - vse32_v_f32m1(outptr2, vget_f32m1x8_f32m1(_p, 2), vl); - vse32_v_f32m1(outptr3, vget_f32m1x8_f32m1(_p, 3), vl); - vse32_v_f32m1(outptr4, vget_f32m1x8_f32m1(_p, 4), vl); - vse32_v_f32m1(outptr5, vget_f32m1x8_f32m1(_p, 5), vl); - vse32_v_f32m1(outptr6, vget_f32m1x8_f32m1(_p, 6), vl); - vse32_v_f32m1(outptr7, vget_f32m1x8_f32m1(_p, 7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vse32_v_f32m1(outptr0, _p0, vl); + 
vse32_v_f32m1(outptr1, _p1, vl); + vse32_v_f32m1(outptr2, _p2, vl); + vse32_v_f32m1(outptr3, _p3, vl); + vse32_v_f32m1(outptr4, _p4, vl); + vse32_v_f32m1(outptr5, _p5, vl); + vse32_v_f32m1(outptr6, _p6, vl); + vse32_v_f32m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -672,20 +701,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); - vfloat32m1x4_t _p0 = vlseg4e32_v_f32m1x4(r0, vl); - vfloat32m1x4_t _p1 = vlseg4e32_v_f32m1x4(r1, vl); + vfloat32m1_t _p00; + vfloat32m1_t _p01; + vfloat32m1_t _p02; + vfloat32m1_t _p03; + vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); - vfloat32m1_t _p00 = vget_f32m1x4_f32m1(_p0, 0); - vfloat32m1_t _p01 = vget_f32m1x4_f32m1(_p0, 1); - vfloat32m1_t _p02 = vget_f32m1x4_f32m1(_p0, 2); - vfloat32m1_t _p03 = vget_f32m1x4_f32m1(_p0, 3); - vfloat32m1_t _p10 = vget_f32m1x4_f32m1(_p1, 0); - vfloat32m1_t _p11 = vget_f32m1x4_f32m1(_p1, 1); - vfloat32m1_t _p12 = vget_f32m1x4_f32m1(_p1, 2); - vfloat32m1_t _p13 = vget_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + vfloat32m1_t _p10; + vfloat32m1_t _p11; + vfloat32m1_t _p12; + vfloat32m1_t _p13; + vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -725,19 +755,19 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vfloat32m1_t _p0 = vget_f32m1x8_f32m1(_p, 0); - vfloat32m1_t _p1 = vget_f32m1x8_f32m1(_p, 1); - vfloat32m1_t _p2 = vget_f32m1x8_f32m1(_p, 2); - vfloat32m1_t _p3 = vget_f32m1x8_f32m1(_p, 3); - vfloat32m1_t _p4 = vget_f32m1x8_f32m1(_p, 4); - vfloat32m1_t _p5 = vget_f32m1x8_f32m1(_p, 5); - vfloat32m1_t _p6 = vget_f32m1x8_f32m1(_p, 6); - vfloat32m1_t _p7 = vget_f32m1x8_f32m1(_p, 7); - vsseg4e32_v_f32m1x4(outptr0, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e32_v_f32m1x4(outptr1, vcreate_f32m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -859,13 +889,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m2(n); + size_t vl = vsetvl_e16m2(n); vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -903,13 +933,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m2(n); - - vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl); - vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl); - vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl); - vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl); - vse16_v_u16m2(outptr3, 
vget_u16m2x4_u16m2(_p, 3), vl); + size_t vl = vsetvl_e16m2(n); + + vuint16m2_t _p0; + vuint16m2_t _p1; + vuint16m2_t _p2; + vuint16m2_t _p3; + vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse16_v_u16m2(outptr0, _p0, vl); + vse16_v_u16m2(outptr1, _p1, vl); + vse16_v_u16m2(outptr2, _p2, vl); + vse16_v_u16m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -951,7 +985,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); @@ -961,7 +995,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -1011,17 +1045,26 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl); - vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl); - vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl); - vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl); - vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), vl); - vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl); - vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl); - vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vse16_v_u16m1(outptr0, _p0, vl); + vse16_v_u16m1(outptr1, _p1, vl); + vse16_v_u16m1(outptr2, _p2, vl); + vse16_v_u16m1(outptr3, _p3, vl); + vse16_v_u16m1(outptr4, _p4, vl); + vse16_v_u16m1(outptr5, _p5, vl); + vse16_v_u16m1(outptr6, _p6, vl); + vse16_v_u16m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -1065,19 +1108,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl); - vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl); - vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0); - vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1); - vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2); - vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3); - vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0); - vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1); - vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2); - vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p00; + vuint16m1_t _p01; + vuint16m1_t _p02; + vuint16m1_t _p03; + vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + + vuint16m1_t _p10; + vuint16m1_t _p11; + vuint16m1_t _p12; + vuint16m1_t _p13; + vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -1117,19 +1162,20 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& 
bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0); - vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1); - vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2); - vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3); - vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4); - vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5); - vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6); - vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7); - vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -1188,13 +1234,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m2(n); + size_t vl = vsetvl_e16m2(n); vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -1232,13 +1278,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m2(n); - - vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl); - vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl); - vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl); - vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl); - vse16_v_u16m2(outptr3, vget_u16m2x4_u16m2(_p, 3), vl); + size_t vl = vsetvl_e16m2(n); + + vuint16m2_t _p0; + vuint16m2_t _p1; + vuint16m2_t _p2; + vuint16m2_t _p3; + vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse16_v_u16m2(outptr0, _p0, vl); + vse16_v_u16m2(outptr1, _p1, vl); + vse16_v_u16m2(outptr2, _p2, vl); + vse16_v_u16m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -1280,7 +1330,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); @@ -1290,7 +1340,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -1340,17 +1390,25 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl); - vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl); - vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl); - vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl); - vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), 
vl); - vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl); - vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl); - vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vse16_v_u16m1(outptr0, _p0, vl); + vse16_v_u16m1(outptr1, _p1, vl); + vse16_v_u16m1(outptr2, _p2, vl); + vse16_v_u16m1(outptr3, _p3, vl); + vse16_v_u16m1(outptr4, _p4, vl); + vse16_v_u16m1(outptr5, _p5, vl); + vse16_v_u16m1(outptr6, _p6, vl); + vse16_v_u16m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -1394,20 +1452,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p00; + vuint16m1_t _p01; + vuint16m1_t _p02; + vuint16m1_t _p03; + vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); - vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl); - vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl); + vuint16m1_t _p10; + vuint16m1_t _p11; + vuint16m1_t _p12; + vuint16m1_t _p13; + vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); - vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0); - vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1); - vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2); - vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3); - vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0); - vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1); - vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2); - vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -1447,19 +1506,20 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0); - vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1); - vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2); - vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3); - vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4); - vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5); - vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6); - vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7); - vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 1f93ecfe92dd..50f5efe1216d 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -16,7 +16,7 @@ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = 
vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -65,7 +65,7 @@ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -144,7 +144,7 @@ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index de29af0f6bf5..8f4b54da5904 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -91,7 +91,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -261,7 +261,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #endif int w = bottom_blob.w; @@ -511,7 +511,7 @@ int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt { #if __riscv_vector const int packn = csrr_vlenb() / 1; - const word_type vl = vsetvl_e8m1(packn); + const size_t vl = vsetvl_e8m1(packn); #endif int w = bottom_blob.w; diff --git a/src/layer/riscv/pooling_riscv.cpp b/src/layer/riscv/pooling_riscv.cpp index 0ca4e3d894c9..1b4c1f0ed8ad 100644 --- a/src/layer/riscv/pooling_riscv.cpp +++ b/src/layer/riscv/pooling_riscv.cpp @@ -72,7 +72,7 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -315,7 +315,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op // avg value in NxN window const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -721,7 +721,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O } const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/prelu_riscv.cpp b/src/layer/riscv/prelu_riscv.cpp index c25223461a10..32cb77023b45 100644 --- a/src/layer/riscv/prelu_riscv.cpp +++ b/src/layer/riscv/prelu_riscv.cpp @@ -63,7 +63,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -84,7 +84,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - 
word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -115,7 +115,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -135,7 +135,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -170,7 +170,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); @@ -191,7 +191,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -303,7 +303,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -324,7 +324,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -355,7 +355,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -375,7 +375,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -410,7 +410,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e16m4(n1); + size_t vl = vsetvl_e16m4(n1); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); @@ -431,7 +431,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -468,7 +468,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl); @@ -489,7 +489,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); @@ -520,7 +520,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); @@ -540,7 +540,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); @@ -575,7 +575,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e16m4(n1); + size_t vl = vsetvl_e16m4(n1); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(slope_ptr, vl), vl); @@ -596,7 +596,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); diff --git a/src/layer/riscv/relu_riscv.cpp b/src/layer/riscv/relu_riscv.cpp index 6b23ebc3a634..cf2d40570690 100644 --- a/src/layer/riscv/relu_riscv.cpp +++ b/src/layer/riscv/relu_riscv.cpp @@ -58,10 +58,10 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmax_vf_f32m8(_p, (float32_t)0.f, vl); + _p = vfmax_vf_f32m8(_p, 0.f, vl); vse32_v_f32m8(ptr, _p, vl); ptr += vl; @@ -82,7 +82,7 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vf_f32m8_m(vmflt_vf_f32m8_b4(_p, .0f, vl), _p, _p, slope, vl); //slope: float(float32_t) @@ -124,10 +124,10 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfmax_vf_f16m8(_p, (float16_t)0.f, vl); + _p = vfmax_vf_f16m8(_p, (__fp16)0.f, vl); vse16_v_f16m8(ptr, _p, vl); ptr += vl; @@ -137,10 +137,10 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c else { int n = size; - float16_t _slope = (float16_t)slope; + __fp16 _slope = (__fp16)slope; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmul_vf_f16m8_m(vmflt_vf_f16m8_b2(_p, .0f, vl), _p, _p, _slope, vl); diff --git a/src/layer/riscv/riscv_activation.h b/src/layer/riscv/riscv_activation.h index 763e719b15d6..d5f114f3aaac 100644 --- a/src/layer/riscv/riscv_activation.h +++ b/src/layer/riscv/riscv_activation.h @@ -22,49 +22,49 @@ #include "rvv_mathfun.h" #include "rvv_mathfun_fp16s.h" -#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN) \ - static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, word_type vl) \ - { \ - if (activation_type == 1) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, 0.f, vl); \ - } \ - else if (activation_type == 2) \ - { \ - vbool##MLEN##_t _lemask = vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, 0.f, vl); \ - _v = vfmul_vf_f##SEW##m##LMUL##_m(_lemask, _v, _v, activation_params[0], vl); \ - } \ - else if (activation_type == 3) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, activation_params[0], vl); \ - _v = vfmin_vf_f##SEW##m##LMUL(_v, activation_params[1], vl); \ - } \ - else if (activation_type == 4) \ - { \ - _v = sigmoid_ps(_v, vl); \ - } \ - else if (activation_type == 5) \ - { \ - _v = vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), 1.f, vl), vl), vl), vl); \ - } \ - else if (activation_type == 6) \ - { \ - const float alpha = activation_params[0]; \ - const float beta = activation_params[1]; \ - const float lower = -beta / alpha; \ - const float upper = (1.f / alpha) + lower; \ - vbool##MLEN##_t _lower = vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, lower, vl); \ - vbool##MLEN##_t _higher = vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, upper, vl); \ - vbool##MLEN##_t _apply = vmnor_mm_b##MLEN(_lower, _higher, vl); \ - _v = 
vfmerge_vfm_f##SEW##m##LMUL(_lower, _v, .0f, vl); \ - \ - vfloat##SEW##m##LMUL##_t _p0 = vfadd_vf_f##SEW##m##LMUL##_m( \ - _apply, _v, /*op1*/ vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, _v, alpha, vl), beta, \ - vl); \ - _v = vfmul_vv_f##SEW##m##LMUL##_m(_apply, _v, /*op1*/ _v, _p0, vl); \ - } \ - \ - return _v; \ +#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN) \ + static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, size_t vl) \ + { \ + if (activation_type == 1) \ + { \ + _v = vfmax_vf_f##SEW##m##LMUL(_v, 0.f, vl); \ + } \ + else if (activation_type == 2) \ + { \ + vbool##MLEN##_t _lemask = vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, 0.f, vl); \ + _v = vfmul_vf_f##SEW##m##LMUL##_m(_lemask, _v, _v, activation_params[0], vl); \ + } \ + else if (activation_type == 3) \ + { \ + _v = vfmax_vf_f##SEW##m##LMUL(_v, activation_params[0], vl); \ + _v = vfmin_vf_f##SEW##m##LMUL(_v, activation_params[1], vl); \ + } \ + else if (activation_type == 4) \ + { \ + _v = sigmoid_ps(_v, vl); \ + } \ + else if (activation_type == 5) \ + { \ + _v = vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), 1.f, vl), vl), vl), vl); \ + } \ + else if (activation_type == 6) \ + { \ + const float alpha = activation_params[0]; \ + const float beta = activation_params[1]; \ + const float lower = -beta / alpha; \ + const float upper = (1.f / alpha) + lower; \ + vbool##MLEN##_t _lower = vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, lower, vl); \ + vbool##MLEN##_t _higher = vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, upper, vl); \ + vbool##MLEN##_t _apply = vmnor_mm_b##MLEN(_lower, _higher, vl); \ + _v = vfmerge_vfm_f##SEW##m##LMUL(_lower, _v, .0f, vl); \ + \ + vfloat##SEW##m##LMUL##_t _p0 = vfadd_vf_f##SEW##m##LMUL##_m( \ + _apply, _v, /*op1*/ vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, _v, alpha, vl), beta, \ + vl); \ + _v = vfmul_vv_f##SEW##m##LMUL##_m(_apply, _v, /*op1*/ _v, _p0, vl); \ + } \ + \ + return _v; \ } _RVV_FLOAT_ACTIVATION_PS(16, 1, 16) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index f60faad50f72..596bf4435c64 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -53,7 +53,7 @@ static inline int csrr_vlenb() static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m8(packn * 8); + const size_t vl = vsetvl_e32m8(packn * 8); // NOTE vloxei8_v_f32m8 gets illegal instruction on d1 --- nihui @@ -90,7 +90,7 @@ static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m8(packn * 8); + const size_t vl = vsetvl_e16m8(packn * 8); // NOTE vloxei8_v_f16m8 gets illegal instruction on d1 --- nihui @@ -125,4 +125,278 @@ static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) #endif // __riscv_zfh #endif // __riscv_vector +#if __riscv_vector && __rvv_tuple + +// f32m1, vsseg.v +static inline void vsseg8e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +{ + vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e32_v_f32m1x8(base, _tmp, vl); +} + +static inline void vsseg4e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) 
+{ + vfloat32m1x4_t _tmp = vcreate_f32m1x4(v0, v1, v2, v3); + vsseg4e32_v_f32m1x4(base, _tmp, vl); +} + +static inline void vsseg2e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +{ + vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); + vsseg2e32_v_f32m1x2(base, _tmp, vl); +} + +// f32m1, vssseg.v, 8/4/2 +static inline void vssseg8e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +{ + vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vssseg8e32_v_f32m1x8(base, bstride, _tmp, vl); +} + +static inline void vssseg4e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) +{ + vfloat32m1x4_t _tmp = vcreate_f32m1x4(v0, v1, v2, v3); + vssseg4e32_v_f32m1x4(base, bstride, _tmp, vl); +} + +static inline void vssseg2e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +{ + vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); + vssseg2e32_v_f32m1x2(base, bstride, _tmp, vl); +} + +// f32m2, vsseg.v, 4/2 +static inline void vsseg4e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3, size_t vl) +{ + vfloat32m2x4_t _tmp = vcreate_f32m2x4(v0, v1, v2, v3); + vsseg4e32_v_f32m2x4(base, _tmp, vl); +} + +static inline void vsseg2e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl) +{ + vfloat32m2x2_t _tmp = vcreate_f32m2x2(v0, v1); + vsseg2e32_v_f32m2x2(base, _tmp, vl); +} + +// u16m1, vsseg.v, 8/4 +static inline void vsseg8e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, vuint16m1_t v4, vuint16m1_t v5, vuint16m1_t v6, vuint16m1_t v7, size_t vl) +{ + vuint16m1x8_t _tmp = vcreate_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e16_v_u16m1x8(base, _tmp, vl); +} + +static inline void vsseg4e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, size_t vl) +{ + vuint16m1x4_t _tmp = vcreate_u16m1x4(v0, v1, v2, v3); + vsseg4e16_v_u16m1x4(base, _tmp, vl); +} + +// u16m2, vsseg.v, 4/2 +static inline void vsseg4e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, vuint16m2_t v2, vuint16m2_t v3, size_t vl) +{ + vuint16m2x4_t _tmp = vcreate_u16m2x4(v0, v1, v2, v3); + vsseg4e16_v_u16m2x4(base, _tmp, vl); +} + +static inline void vsseg2e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, size_t vl) +{ + vuint16m2x2_t _tmp = vcreate_u16m2x2(v0, v1); + vsseg2e16_v_u16m2x2(base, _tmp, vl); +} + +// f32m1, vlseg.v 8/4/2 +static inline void vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float32_t* base, size_t vl) +{ + vfloat32m1x8_t _tmp = vlseg8e32_v_f32m1x8(base, vl); + *v0 = vget_f32m1x8_f32m1(_tmp, 0); + *v1 = vget_f32m1x8_f32m1(_tmp, 1); + *v2 = vget_f32m1x8_f32m1(_tmp, 2); + *v3 = vget_f32m1x8_f32m1(_tmp, 3); + *v4 = vget_f32m1x8_f32m1(_tmp, 4); + *v5 = vget_f32m1x8_f32m1(_tmp, 5); + *v6 = vget_f32m1x8_f32m1(_tmp, 6); + *v7 = vget_f32m1x8_f32m1(_tmp, 7); +} + +static inline void vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float32_t* base, size_t vl) +{ + vfloat32m1x4_t _tmp = vlseg4e32_v_f32m1x4(base, vl); + *v0 = vget_f32m1x4_f32m1(_tmp, 0); + *v1 = vget_f32m1x4_f32m1(_tmp, 1); + *v2 = 
vget_f32m1x4_f32m1(_tmp, 2); + *v3 = vget_f32m1x4_f32m1(_tmp, 3); +} + +static inline void vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float32_t* base, size_t vl) +{ + vfloat32m1x2_t _tmp = vlseg2e32_v_f32m1x2(base, vl); + *v0 = vget_f32m1x2_f32m1(_tmp, 0); + *v1 = vget_f32m1x2_f32m1(_tmp, 1); +} + +// f32m2, vlseg.v, 4 +static inline void vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float32_t* base, size_t vl) +{ + vfloat32m2x4_t _tmp = vlseg4e32_v_f32m2x4(base, vl); + *v0 = vget_f32m2x4_f32m2(_tmp, 0); + *v1 = vget_f32m2x4_f32m2(_tmp, 1); + *v2 = vget_f32m2x4_f32m2(_tmp, 2); + *v3 = vget_f32m2x4_f32m2(_tmp, 3); +} + +// f32m4, vlseg.v, 2 +static inline void vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, size_t vl) +{ + vfloat32m4x2_t _tmp = vlseg2e32_v_f32m4x2(base, vl); + *v0 = vget_f32m4x2_f32m4(_tmp, 0); + *v1 = vget_f32m4x2_f32m4(_tmp, 1); +} + +// f32m4, vloxseg.v +static inline void vloxseg2ei32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, vuint32m4_t bindex, size_t vl) +{ + vfloat32m4x2_t _tmp = vloxseg2ei32_v_f32m4x2(base, bindex, vl); + *v0 = vget_f32m4x2_f32m4(_tmp, 0); + *v1 = vget_f32m4x2_f32m4(_tmp, 1); +} + +// u16m1, vlseg.v 8/4/2 +static inline void vlseg8e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, vuint16m1_t* v4, vuint16m1_t* v5, vuint16m1_t* v6, vuint16m1_t* v7, const uint16_t* base, size_t vl) +{ + vuint16m1x8_t _tmp = vlseg8e16_v_u16m1x8(base, vl); + *v0 = vget_u16m1x8_u16m1(_tmp, 0); + *v1 = vget_u16m1x8_u16m1(_tmp, 1); + *v2 = vget_u16m1x8_u16m1(_tmp, 2); + *v3 = vget_u16m1x8_u16m1(_tmp, 3); + *v4 = vget_u16m1x8_u16m1(_tmp, 4); + *v5 = vget_u16m1x8_u16m1(_tmp, 5); + *v6 = vget_u16m1x8_u16m1(_tmp, 6); + *v7 = vget_u16m1x8_u16m1(_tmp, 7); +} + +static inline void vlseg4e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, const uint16_t* base, size_t vl) +{ + vuint16m1x4_t _tmp = vlseg4e16_v_u16m1x4(base, vl); + *v0 = vget_u16m1x4_u16m1(_tmp, 0); + *v1 = vget_u16m1x4_u16m1(_tmp, 1); + *v2 = vget_u16m1x4_u16m1(_tmp, 2); + *v3 = vget_u16m1x4_u16m1(_tmp, 3); +} + +static inline void vlseg2e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, const uint16_t* base, size_t vl) +{ + vuint16m1x2_t _tmp = vlseg2e16_v_u16m1x2(base, vl); + *v0 = vget_u16m1x2_u16m1(_tmp, 0); + *v1 = vget_u16m1x2_u16m1(_tmp, 1); +} + +// u16m2, vlseg.v, 4 +static inline void vlseg4e16_v_u16m2(vuint16m2_t* v0, vuint16m2_t* v1, vuint16m2_t* v2, vuint16m2_t* v3, const uint16_t* base, size_t vl) +{ + vuint16m2x4_t _tmp = vlseg4e16_v_u16m2x4(base, vl); + *v0 = vget_u16m2x4_u16m2(_tmp, 0); + *v1 = vget_u16m2x4_u16m2(_tmp, 1); + *v2 = vget_u16m2x4_u16m2(_tmp, 2); + *v3 = vget_u16m2x4_u16m2(_tmp, 3); +} + +// u16m4, vlseg.v, 2 +static inline void vlseg2e16_v_u16m4(vuint16m4_t* v0, vuint16m4_t* v1, const uint16_t* base, size_t vl) +{ + vuint16m4x2_t _tmp = vlseg2e16_v_u16m4x2(base, vl); + *v0 = vget_u16m4x2_u16m4(_tmp, 0); + *v1 = vget_u16m4x2_u16m4(_tmp, 1); +} + +#if __riscv_zfh + +// f16m1, vsseg.v, 8/4/2 +static inline void vsseg8e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) +{ + vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e16_v_f16m1x8(base, _tmp, vl); +} + +static inline void vsseg4e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, 
vfloat16m1_t v3, size_t vl) +{ + vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); + vsseg4e16_v_f16m1x4(base, _tmp, vl); +} + +static inline void vsseg2e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) +{ + vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); + vsseg2e16_v_f16m1x2(base, _tmp, vl); +} + +// f16m1, vssseg.v, 8/4/2 +static inline void vssseg8e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) +{ + vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vssseg8e16_v_f16m1x8(base, bstride, _tmp, vl); +} + +static inline void vssseg4e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, size_t vl) +{ + vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); + vssseg4e16_v_f16m1x4(base, bstride, _tmp, vl); +} + +static inline void vssseg2e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) +{ + vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); + vssseg2e16_v_f16m1x2(base, bstride, _tmp, vl); +} + +// f16m1, vlseg.v 8/4/2 +static inline void vlseg8e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, vfloat16m1_t* v4, vfloat16m1_t* v5, vfloat16m1_t* v6, vfloat16m1_t* v7, const float16_t* base, size_t vl) +{ + vfloat16m1x8_t _tmp = vlseg8e16_v_f16m1x8(base, vl); + *v0 = vget_f16m1x8_f16m1(_tmp, 0); + *v1 = vget_f16m1x8_f16m1(_tmp, 1); + *v2 = vget_f16m1x8_f16m1(_tmp, 2); + *v3 = vget_f16m1x8_f16m1(_tmp, 3); + *v4 = vget_f16m1x8_f16m1(_tmp, 4); + *v5 = vget_f16m1x8_f16m1(_tmp, 5); + *v6 = vget_f16m1x8_f16m1(_tmp, 6); + *v7 = vget_f16m1x8_f16m1(_tmp, 7); +} + +static inline void vlseg4e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, const float16_t* base, size_t vl) +{ + vfloat16m1x4_t _tmp = vlseg4e16_v_f16m1x4(base, vl); + *v0 = vget_f16m1x4_f16m1(_tmp, 0); + *v1 = vget_f16m1x4_f16m1(_tmp, 1); + *v2 = vget_f16m1x4_f16m1(_tmp, 2); + *v3 = vget_f16m1x4_f16m1(_tmp, 3); +} + +static inline void vlseg2e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, const float16_t* base, size_t vl) +{ + vfloat16m1x2_t _tmp = vlseg2e16_v_f16m1x2(base, vl); + *v0 = vget_f16m1x2_f16m1(_tmp, 0); + *v1 = vget_f16m1x2_f16m1(_tmp, 1); +} + +// f16m2, vlseg.v, 4 +static inline void vlseg4e16_v_f16m2(vfloat16m2_t* v0, vfloat16m2_t* v1, vfloat16m2_t* v2, vfloat16m2_t* v3, const float16_t* base, size_t vl) +{ + vfloat16m2x4_t _tmp = vlseg4e16_v_f16m2x4(base, vl); + *v0 = vget_f16m2x4_f16m2(_tmp, 0); + *v1 = vget_f16m2x4_f16m2(_tmp, 1); + *v2 = vget_f16m2x4_f16m2(_tmp, 2); + *v3 = vget_f16m2x4_f16m2(_tmp, 3); +} + +// f16m4, vlseg.v, 2 +static inline void vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const float16_t* base, size_t vl) +{ + vfloat16m4x2_t _tmp = vlseg2e16_v_f16m4x2(base, vl); + *v0 = vget_f16m4x2_f16m4(_tmp, 0); + *v1 = vget_f16m4x2_f16m4(_tmp, 1); +} + +#endif // __riscv_zfh +#endif // __riscv_vector + #endif // RISCV_USABILITY_H diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 8993b5ad8e69..aa966de6c86a 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -32,7 +32,7 @@ #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT32_LOG_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, 
size_t vl) \ { \ x = vfmax_vf_f32m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = vmfle_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl); \ @@ -118,7 +118,7 @@ _RVV_FLOAT32_LOG_OP(8, 4) #define c_cephes_exp_p5 5.0000001201E-1 #define _RVV_FLOAT32_EXP_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t tmp, fx; \ \ @@ -184,7 +184,7 @@ _RVV_FLOAT32_EXP_OP(8, 4) #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT32_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t* ysin, vfloat32m##LMUL##_t* ycos, word_type vl) \ + static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t* ysin, vfloat32m##LMUL##_t* ycos, size_t vl) \ { \ /* any x */ \ vfloat32m##LMUL##_t xmm1, xmm2, xmm3, y; \ @@ -257,12 +257,12 @@ _RVV_FLOAT32_SINCOS_OP(2, 16) _RVV_FLOAT32_SINCOS_OP(4, 8) _RVV_FLOAT32_SINCOS_OP(8, 4) -#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ysin; \ +#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ysin; \ } _RVV_FLOAT32_SIN_OP(1, 32) @@ -270,12 +270,12 @@ _RVV_FLOAT32_SIN_OP(2, 16) _RVV_FLOAT32_SIN_OP(4, 8) _RVV_FLOAT32_SIN_OP(8, 4) -#define _RVV_FLOAT32_COS_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ycos; \ +#define _RVV_FLOAT32_COS_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ycos; \ } _RVV_FLOAT32_COS_OP(1, 32) @@ -293,7 +293,7 @@ _RVV_FLOAT32_COS_OP(8, 4) #define c_cephes_tanh_p4 -3.33332819422E-1 #define _RVV_FLOAT32_TANH_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t x2 = vfsgnj_vf_f32m##LMUL(x, 1.f, vl); \ \ @@ -341,11 +341,11 @@ _RVV_FLOAT32_TANH_OP(2, 16) _RVV_FLOAT32_TANH_OP(4, 8) _RVV_FLOAT32_TANH_OP(8, 4) -#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, word_type vl) \ - { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ +#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, size_t vl) \ + { \ + /* pow(x, m) = exp(m * log(x)) */ \ + return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ } _RVV_FLOAT32_POW_OP(1, 32) @@ -354,7 +354,7 @@ _RVV_FLOAT32_POW_OP(4, 8) _RVV_FLOAT32_POW_OP(8, 4) #define _RVV_FLOAT32_SIGMOID_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t sigmoid_ps(vfloat32m##LMUL##_t _v, word_type vl) \ + static inline vfloat32m##LMUL##_t sigmoid_ps(vfloat32m##LMUL##_t _v, size_t vl) \ { \ _v = vfneg_v_f32m##LMUL(_v, vl); \ _v = exp_ps(_v, vl); \ @@ -447,8 +447,8 @@ _RVV_FLOAT32_SIGMOID_OP(8, 4) #define c_erfc_sb7 -2.2440952301e+01f /* 0xc1b38712 
*/ #define _RVV_FLOAT32_FMA_HELPER(LMUL) \ - static inline vfloat32m##LMUL##_t vfmadd_vff_f32m##LMUL(vfloat32m##LMUL##_t a, float32_t b, \ - float32_t c, word_type vl) \ + static inline vfloat32m##LMUL##_t vfmadd_vff_f32m##LMUL(vfloat32m##LMUL##_t a, float b, \ + float c, size_t vl) \ { \ vfloat32m##LMUL##_t ret = vfmul_vf_f32m##LMUL(a, b, vl); \ ret = vfadd_vf_f32m##LMUL(ret, c, vl); \ @@ -456,7 +456,7 @@ _RVV_FLOAT32_SIGMOID_OP(8, 4) } \ \ static inline vfloat32m##LMUL##_t vfmadd_vvf_f32m##LMUL(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, \ - float32_t c, word_type vl) \ + float c, size_t vl) \ { \ vfloat32m##LMUL##_t ret = vfmul_vv_f32m##LMUL(a, b, vl); \ ret = vfadd_vf_f32m##LMUL(ret, c, vl); \ @@ -469,7 +469,7 @@ _RVV_FLOAT32_FMA_HELPER(2) _RVV_FLOAT32_FMA_HELPER(1) #define _RVV_FLOAT32_ERFC_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t erfc_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t erfc_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ /* Argument for polys */ \ vfloat32m##LMUL##_t absx = vfsgnjx_vv_f32m##LMUL(x, x, vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index 129a4f940378..e7f18b961ae1 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -32,7 +32,7 @@ #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT16_LOG_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ x = vfmax_vf_f16m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = vmfle_vf_f16m##LMUL##_b##MLEN(x, 0.f, vl); \ @@ -118,7 +118,7 @@ _RVV_FLOAT16_LOG_OP(8, 2) #define c_cephes_exp_p5 5.0000001201E-1 #define _RVV_FLOAT16_EXP_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ vfloat16m##LMUL##_t tmp, fx; \ \ @@ -184,7 +184,7 @@ _RVV_FLOAT16_EXP_OP(8, 2) #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT16_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, word_type vl) \ + static inline void sincos_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, size_t vl) \ { \ /* any x */ \ vfloat16m##LMUL##_t xmm1, xmm2, xmm3, y; \ @@ -257,12 +257,12 @@ _RVV_FLOAT16_SINCOS_OP(2, 8) _RVV_FLOAT16_SINCOS_OP(4, 4) _RVV_FLOAT16_SINCOS_OP(8, 2) -#define _RVV_FLOAT16_SIN_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t sin_ps(vfloat16m##LMUL##_t x, word_type vl) \ - { \ - vfloat16m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ysin; \ +#define _RVV_FLOAT16_SIN_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t sin_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + vfloat16m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ysin; \ } _RVV_FLOAT16_SIN_OP(1, 16) @@ -270,12 +270,12 @@ _RVV_FLOAT16_SIN_OP(2, 8) _RVV_FLOAT16_SIN_OP(4, 4) _RVV_FLOAT16_SIN_OP(8, 2) -#define _RVV_FLOAT16_COS_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t cos_ps(vfloat16m##LMUL##_t x, word_type vl) \ - { \ - vfloat16m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ycos; \ +#define _RVV_FLOAT16_COS_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t cos_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + vfloat16m##LMUL##_t ysin, ycos; \ + 
sincos_ps(x, &ysin, &ycos, vl); \ + return ycos; \ } _RVV_FLOAT16_COS_OP(1, 16) @@ -293,7 +293,7 @@ _RVV_FLOAT16_COS_OP(8, 2) #define c_cephes_tanh_p4 -3.33332819422E-1 #define _RVV_FLOAT16_TANH_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ vfloat16m##LMUL##_t x2 = vfsgnj_vf_f16m##LMUL(x, 1.f, vl); \ \ @@ -341,11 +341,11 @@ _RVV_FLOAT16_TANH_OP(2, 8) _RVV_FLOAT16_TANH_OP(4, 4) _RVV_FLOAT16_TANH_OP(8, 2) -#define _RVV_FLOAT16_POW_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t pow_ps(vfloat16m##LMUL##_t a, vfloat16m##LMUL##_t b, word_type vl) \ - { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(vfmul_vv_f16m##LMUL(b, log_ps(a, vl), vl), vl); \ +#define _RVV_FLOAT16_POW_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t pow_ps(vfloat16m##LMUL##_t a, vfloat16m##LMUL##_t b, size_t vl) \ + { \ + /* pow(x, m) = exp(m * log(x)) */ \ + return exp_ps(vfmul_vv_f16m##LMUL(b, log_ps(a, vl), vl), vl); \ } _RVV_FLOAT16_POW_OP(1, 16) @@ -354,7 +354,7 @@ _RVV_FLOAT16_POW_OP(4, 4) _RVV_FLOAT16_POW_OP(8, 2) #define _RVV_FLOAT16_SIGMOID_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, word_type vl) \ + static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, size_t vl) \ { \ _v = vfneg_v_f16m##LMUL(_v, vl); \ _v = exp_ps(_v, vl); \ diff --git a/src/layer/riscv/selu_riscv.cpp b/src/layer/riscv/selu_riscv.cpp index 9a4939c84211..932db355cc20 100644 --- a/src/layer/riscv/selu_riscv.cpp +++ b/src/layer/riscv/selu_riscv.cpp @@ -39,7 +39,7 @@ int SELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, 0.f, vl); vbool4_t _higher = vmnot_m_b4(_lower, vl); diff --git a/src/layer/riscv/sigmoid_riscv.cpp b/src/layer/riscv/sigmoid_riscv.cpp index afd07ea2b383..6c10582c668b 100644 --- a/src/layer/riscv/sigmoid_riscv.cpp +++ b/src/layer/riscv/sigmoid_riscv.cpp @@ -64,7 +64,7 @@ int Sigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = sigmoid_ps(_p, vl); @@ -104,7 +104,7 @@ int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = sigmoid_ps(_p, vl); @@ -135,7 +135,7 @@ int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = sigmoid_ps(_p, vl); diff --git a/src/layer/riscv/softmax_riscv.cpp b/src/layer/riscv/softmax_riscv.cpp index 7a93e5de18dc..ca910c3d3c09 100644 --- a/src/layer/riscv/softmax_riscv.cpp +++ b/src/layer/riscv/softmax_riscv.cpp @@ -44,7 +44,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); vfloat32m1_t _max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); @@ -61,7 +61,7 @@ int Softmax_riscv::forward_inplace(Mat& 
bottom_top_blob, const Option& opt) cons ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); @@ -80,7 +80,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); _p = vfdiv_vf_f32m8(_p, sum, vl); @@ -112,7 +112,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -141,7 +141,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); @@ -168,7 +168,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -198,7 +198,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr1 = ptr; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr1, vl); vfloat32m1_t _m = vfmv_s_f_f32m1(vundefined_f32m1(), m, vl); @@ -215,7 +215,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr2 = ptr; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _p = vle32_v_f32m8(ptr2, vl); vfloat32m1_t _s = vfmv_s_f_f32m1(vundefined_f32m1(), s, vl); @@ -233,7 +233,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr3 = ptr; while (n3 > 0) { - word_type vl = vsetvl_e32m8(n3); + size_t vl = vsetvl_e32m8(n3); vfloat32m8_t _p = vle32_v_f32m8(ptr3, vl); @@ -269,7 +269,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _max = vle32_v_f32m8(max, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -295,7 +295,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -319,7 +319,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -358,7 +358,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -392,7 +392,7 @@ int Softmax_riscv::forward_inplace(Mat& 
bottom_top_blob, const Option& opt) cons while (n) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); @@ -422,7 +422,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); @@ -457,7 +457,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_1 = ptr; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr_1, vl); vfloat32m1_t _scalar_max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); _scalar_max = vfredmax_vs_f32m8_f32m1(_scalar_max, _p, _scalar_max, vl); @@ -473,7 +473,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_2 = ptr; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _p = vle32_v_f32m8(ptr_2, vl); vfloat32m1_t _scalar_sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); @@ -491,7 +491,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_3 = ptr; while (n3 > 0) { - word_type vl = vsetvl_e32m8(n3); + size_t vl = vsetvl_e32m8(n3); vfloat32m8_t _p = vle32_v_f32m8(ptr_3, vl); _p = vfdiv_vf_f32m8(_p, sum, vl); diff --git a/src/layer/riscv/swish_riscv.cpp b/src/layer/riscv/swish_riscv.cpp index f12ab157ae9c..17493d7db69a 100644 --- a/src/layer/riscv/swish_riscv.cpp +++ b/src/layer/riscv/swish_riscv.cpp @@ -64,7 +64,7 @@ int Swish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); @@ -103,7 +103,7 @@ int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); @@ -134,7 +134,7 @@ int Swish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfdiv_vv_f16m8(_p, vfadd_vf_f16m8(exp_ps(vfneg_v_f16m8(_p, vl), vl), 1.f, vl), vl); diff --git a/src/layer/riscv/tanh_riscv.cpp b/src/layer/riscv/tanh_riscv.cpp index b0f0cafe7d70..d47de61dc59c 100644 --- a/src/layer/riscv/tanh_riscv.cpp +++ b/src/layer/riscv/tanh_riscv.cpp @@ -64,7 +64,7 @@ int TanH_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = tanh_ps(_p, vl); @@ -103,7 +103,7 @@ int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = tanh_ps(_p, vl); @@ -134,7 +134,7 @@ int 
TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = tanh_ps(_p, vl); diff --git a/src/layer/riscv/unaryop_riscv.cpp b/src/layer/riscv/unaryop_riscv.cpp index 62c6a52740b5..e5eb80151b17 100644 --- a/src/layer/riscv/unaryop_riscv.cpp +++ b/src/layer/riscv/unaryop_riscv.cpp @@ -55,7 +55,7 @@ static int unary_op_inplace(Mat& a, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = op(_p, vl); @@ -73,7 +73,7 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfsgnj_vf_f32m8(x, 1.f, vl); } @@ -81,7 +81,7 @@ struct unary_op_abs struct unary_op_neg { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfneg_v_f32m8(x, vl); } @@ -89,7 +89,7 @@ struct unary_op_neg struct unary_op_floor { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); vbool4_t _mask = vmfgt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); @@ -99,7 +99,7 @@ struct unary_op_floor struct unary_op_ceil { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); vbool4_t _mask = vmflt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); @@ -109,7 +109,7 @@ struct unary_op_ceil struct unary_op_square { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfmul_vv_f32m8(x, x, vl); } @@ -117,7 +117,7 @@ struct unary_op_square struct unary_op_sqrt { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfsqrt_v_f32m8(x, vl); } @@ -125,7 +125,7 @@ struct unary_op_sqrt struct unary_op_rsqrt { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vfloat32m8_t _reciprocal = vfrsqrt7_v_f32m8(x, vl); _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); @@ -136,7 +136,7 @@ struct unary_op_rsqrt struct unary_op_exp { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } @@ -144,7 +144,7 @@ struct unary_op_exp struct unary_op_log { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return log_ps(x, vl); } @@ -152,7 +152,7 @@ struct unary_op_log struct unary_op_sin { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } @@ -160,7 +160,7 @@ struct unary_op_sin struct unary_op_cos { - 
vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } @@ -168,7 +168,7 @@ struct unary_op_cos struct unary_op_tan { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -183,7 +183,7 @@ struct unary_op_tan struct unary_op_asin { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -198,7 +198,7 @@ struct unary_op_asin struct unary_op_acos { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -213,7 +213,7 @@ struct unary_op_acos struct unary_op_atan { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -228,7 +228,7 @@ struct unary_op_atan struct unary_op_reciprocal { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vfloat32m8_t _reciprocal = vfrec7_v_f32m8(x, vl); _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); @@ -239,7 +239,7 @@ struct unary_op_reciprocal struct unary_op_tanh { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } @@ -338,7 +338,7 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = op(_p, vl); @@ -356,7 +356,7 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfsgnj_vf_f16m8(x, 1.f, vl); } @@ -364,7 +364,7 @@ struct unary_op_abs_fp16s struct unary_op_neg_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfneg_v_f16m8(x, vl); } @@ -372,7 +372,7 @@ struct unary_op_neg_fp16s struct unary_op_floor_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); vbool2_t _mask = vmfgt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); @@ -382,7 +382,7 @@ struct unary_op_floor_fp16s struct unary_op_ceil_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); vbool2_t _mask = vmflt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); @@ -392,7 +392,7 @@ struct unary_op_ceil_fp16s struct unary_op_square_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfmul_vv_f16m8(x, x, vl); } @@ -400,7 
+400,7 @@ struct unary_op_square_fp16s struct unary_op_sqrt_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfsqrt_v_f16m8(x, vl); } @@ -408,7 +408,7 @@ struct unary_op_sqrt_fp16s struct unary_op_rsqrt_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vfloat16m8_t _reciprocal = vfrsqrt7_v_f16m8(x, vl); _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); @@ -419,7 +419,7 @@ struct unary_op_rsqrt_fp16s struct unary_op_exp_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } @@ -427,7 +427,7 @@ struct unary_op_exp_fp16s struct unary_op_log_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return log_ps(x, vl); } @@ -435,7 +435,7 @@ struct unary_op_log_fp16s struct unary_op_sin_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } @@ -443,7 +443,7 @@ struct unary_op_sin_fp16s struct unary_op_cos_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } @@ -451,7 +451,7 @@ struct unary_op_cos_fp16s struct unary_op_tan_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -466,7 +466,7 @@ struct unary_op_tan_fp16s struct unary_op_asin_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -481,7 +481,7 @@ struct unary_op_asin_fp16s struct unary_op_acos_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -496,7 +496,7 @@ struct unary_op_acos_fp16s struct unary_op_atan_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -511,7 +511,7 @@ struct unary_op_atan_fp16s struct unary_op_reciprocal_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vfloat16m8_t _reciprocal = vfrec7_v_f16m8(x, vl); _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); @@ -522,7 +522,7 @@ struct unary_op_reciprocal_fp16s struct unary_op_tanh_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } diff --git a/src/layer/squeeze.cpp b/src/layer/squeeze.cpp index 14840a8215ef..7f9de8933cdf 100644 
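The unary_op_rsqrt and unary_op_reciprocal functors in the unaryop hunks above refine the low-precision vfrsqrt7 / vfrec7 estimates with a single Newton-Raphson step. A scalar C++ sketch of that step, illustrative only and not part of the patch (the helper names are mine):

// One Newton-Raphson iteration roughly doubles the number of correct bits
// of the hardware estimate.
static inline float refine_rsqrt(float x, float r) // r ~ 1/sqrt(x)
{
    return r * (1.5f - 0.5f * x * r * r); // same arithmetic as the vfmul/vfrsub chain
}

static inline float refine_recip(float x, float r) // r ~ 1/x
{
    return r * (2.f - x * r); // matches the vfrsub_vf_f32m8(..., 2.f, ...) step
}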
--- a/src/layer/squeeze.cpp +++ b/src/layer/squeeze.cpp @@ -26,6 +26,7 @@ int Squeeze::load_param(const ParamDict& pd) { squeeze_w = pd.get(0, 0); squeeze_h = pd.get(1, 0); + squeeze_d = pd.get(11, 0); squeeze_c = pd.get(2, 0); axes = pd.get(3, Mat()); @@ -36,17 +37,20 @@ int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c { int w = bottom_blob.w; int h = bottom_blob.h; + int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; bool _squeeze_w = false; bool _squeeze_h = false; + bool _squeeze_d = false; bool _squeeze_c = false; if (axes.empty()) { _squeeze_w = w == 1 && squeeze_w; _squeeze_h = h == 1 && squeeze_h; + _squeeze_d = d == 1 && squeeze_d; _squeeze_c = channels == 1 && squeeze_c; } else @@ -82,6 +86,22 @@ int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c { _squeeze_w = w == 1; } + if (dims == 4 && axis == 0) + { + _squeeze_c = channels == 1; + } + if (dims == 4 && axis == 1) + { + _squeeze_d = d == 1; + } + if (dims == 4 && axis == 2) + { + _squeeze_h = h == 1; + } + if (dims == 4 && axis == 3) + { + _squeeze_w = w == 1; + } } } @@ -143,6 +163,70 @@ int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c } } + if (dims == 4) + { + if (_squeeze_w && _squeeze_h && _squeeze_d && _squeeze_c) + { + top_blob = bottom_blob.reshape(1, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_h && _squeeze_d) + { + top_blob = bottom_blob.reshape(channels, opt.blob_allocator); + } + else if (_squeeze_h && _squeeze_d && _squeeze_c) + { + top_blob = bottom_blob.reshape(w, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_d && _squeeze_c) + { + top_blob = bottom_blob.reshape(h, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_h && _squeeze_c) + { + top_blob = bottom_blob.reshape(d, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_h) + { + top_blob = bottom_blob.reshape(d, channels, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_d) + { + top_blob = bottom_blob.reshape(h, channels, opt.blob_allocator); + } + else if (_squeeze_h && _squeeze_d) + { + top_blob = bottom_blob.reshape(w, channels, opt.blob_allocator); + } + else if (_squeeze_h && _squeeze_c) + { + top_blob = bottom_blob.reshape(w, d, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_c) + { + top_blob = bottom_blob.reshape(h, d, opt.blob_allocator); + } + else if (_squeeze_d && _squeeze_c) + { + top_blob = bottom_blob.reshape(w, h, opt.blob_allocator); + } + else if (_squeeze_w) + { + top_blob = bottom_blob.reshape(h, d, channels, opt.blob_allocator); + } + else if (_squeeze_h) + { + top_blob = bottom_blob.reshape(w, d, channels, opt.blob_allocator); + } + else if (_squeeze_d) + { + top_blob = bottom_blob.reshape(w, h, channels, opt.blob_allocator); + } + else if (_squeeze_c) + { + top_blob = bottom_blob.reshape(w, h, d, opt.blob_allocator); + } + } + if (top_blob.empty()) return -100; diff --git a/src/layer/squeeze.h b/src/layer/squeeze.h index cea5a413cc29..536a3b9769ee 100644 --- a/src/layer/squeeze.h +++ b/src/layer/squeeze.h @@ -31,6 +31,7 @@ class Squeeze : public Layer public: int squeeze_w; int squeeze_h; + int squeeze_d; int squeeze_c; Mat axes; }; diff --git a/src/layer/unfold.cpp b/src/layer/unfold.cpp new file mode 100644 index 000000000000..f747a169cedd --- /dev/null +++ b/src/layer/unfold.cpp @@ -0,0 +1,146 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. 
All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "unfold.h" + +namespace ncnn { + +Unfold::Unfold() +{ + one_blob_only = true; +} + +int Unfold::load_param(const ParamDict& pd) +{ + kernel_w = pd.get(1, 0); + kernel_h = pd.get(11, kernel_w); + dilation_w = pd.get(2, 1); + dilation_h = pd.get(12, dilation_w); + stride_w = pd.get(3, 1); + stride_h = pd.get(13, stride_w); + pad_left = pd.get(4, 0); + pad_right = pd.get(15, pad_left); + pad_top = pd.get(14, pad_left); + pad_bottom = pd.get(16, pad_top); + pad_value = pd.get(18, 0.f); + + return 0; +} + +int Unfold::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + Mat bottom_blob_bordered; + { + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + opt_b.use_packing_layout = false; + make_padding(bottom_blob, bottom_blob_bordered, opt_b); + if (bottom_blob_bordered.empty()) + return -100; + } + + const int w = bottom_blob_bordered.w; + const int h = bottom_blob_bordered.h; + const int channels = bottom_blob_bordered.c; + const size_t elemsize = bottom_blob_bordered.elemsize; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = (h - kernel_extent_h) / stride_h + 1; + + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + top_blob.create(size, maxk * channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // im2col + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const Mat img = bottom_blob_bordered.channel(p); + float* ptr = top_blob.row(p * maxk); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + + return 0; +} + +void Unfold::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b); + } + else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + { + // tensorflow padding=SAME or onnx padding=SAME_UPPER + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) 
+ { + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b); + } + } + else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234) + { + // onnx padding=SAME_LOWER + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b); + } + } +} + +} // namespace ncnn diff --git a/src/layer/unfold.h b/src/layer/unfold.h new file mode 100644 index 000000000000..ff7860b7f72a --- /dev/null +++ b/src/layer/unfold.h @@ -0,0 +1,50 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_UNFOLD_H +#define LAYER_UNFOLD_H + +#include "layer.h" + +namespace ncnn { + +class Unfold : public Layer +{ +public: + Unfold(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const; + +public: + int kernel_w; + int kernel_h; + int dilation_w; + int dilation_h; + int stride_w; + int stride_h; + int pad_left; // -233=SAME_UPPER -234=SAME_LOWER + int pad_right; + int pad_top; + int pad_bottom; + float pad_value; +}; + +} // namespace ncnn + +#endif // LAYER_UNFOLD_H diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index 5a73695e7c13..22e817d34e65 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -794,7 +794,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) convert_packing(bias_data, bias_data_packed, out_elempack, opt); } - if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && num_input >= 16 && num_output >= 16) + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) + { + // pass + } + else if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && num_input >= 16 && num_output >= 16) { bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; @@ -872,7 +876,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } pipeline_convolution_gemm->create(shader_type_index, opt, specializations); } - if (is_conv1x1s1d1) + else if (is_conv1x1s1d1) { bool use_cooperative_matrix = 
vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; @@ -1221,13 +1225,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; bool pre_winograd43 = opt.use_winograd43_convolution; - if (vkdev->info.type() == 0 && ((w <= 18 && h <= 18) || ((w >= 23 && w <= 24) && (h >= 23 && h <= 24)))) - pre_winograd43 = false; - if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) - pre_winograd43 = false; + if (opt.use_winograd23_convolution) + { + if (vkdev->info.type() == 0 && ((w <= 18 && h <= 18) || ((w >= 23 && w <= 24) && (h >= 23 && h <= 24)))) + pre_winograd43 = false; + if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) + pre_winograd43 = false; - if (use_cooperative_matrix && (w <= 18 && h <= 18)) - pre_winograd43 = false; + if (use_cooperative_matrix && (w <= 18 && h <= 18)) + pre_winograd43 = false; + } if (pre_winograd43) { @@ -1660,10 +1667,13 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) { bool pre_winograd43 = opt.use_winograd43_convolution; - if (vkdev->info.type() == 0 && ((w <= 18 && h <= 18) || ((w >= 23 && w <= 24) && (h >= 23 && h <= 24)))) - pre_winograd43 = false; - if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) - pre_winograd43 = false; + if (opt.use_winograd23_convolution) + { + if (vkdev->info.type() == 0 && ((w <= 18 && h <= 18) || ((w >= 23 && w <= 24) && (h >= 23 && h <= 24)))) + pre_winograd43 = false; + if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) + pre_winograd43 = false; + } if (pre_winograd43) { diff --git a/src/layer/vulkan/elu_vulkan.cpp b/src/layer/vulkan/elu_vulkan.cpp new file mode 100644 index 000000000000..cf35b5b666d3 --- /dev/null +++ b/src/layer/vulkan/elu_vulkan.cpp @@ -0,0 +1,182 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "elu_vulkan.h" + +#include "layer_shader_type.h" + +namespace ncnn { + +ELU_vulkan::ELU_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_elu = 0; + pipeline_elu_pack4 = 0; + pipeline_elu_pack8 = 0; +} + +int ELU_vulkan::create_pipeline(const Option& opt) +{ + const Mat& shape = top_shapes.empty() ? Mat() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 
4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3 || shape.dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Mat shape_packed; + if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); + + std::vector specializations(1 + 5); + specializations[0].f = alpha; + specializations[1 + 0].i = shape_packed.dims; + specializations[1 + 1].i = shape_packed.w; + specializations[1 + 2].i = shape_packed.h * shape_packed.d; + specializations[1 + 3].i = shape_packed.c; + specializations[1 + 4].i = shape_packed.cstep; + + Mat local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + if (shape_packed.dims == 4) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_elu = new Pipeline(vkdev); + pipeline_elu->set_optimal_local_size_xyz(local_size_xyz); + pipeline_elu->create(LayerShaderType::elu, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_elu_pack4 = new Pipeline(vkdev); + pipeline_elu_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_elu_pack4->create(LayerShaderType::elu_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_elu_pack8 = new Pipeline(vkdev); + pipeline_elu_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_elu_pack8->create(LayerShaderType::elu_pack8, opt, specializations); + } + + return 0; +} + +int ELU_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_elu; + pipeline_elu = 0; + + delete pipeline_elu_pack4; + pipeline_elu_pack4 = 0; + + delete pipeline_elu_pack8; + pipeline_elu_pack8 = 0; + + return 0; +} + +int ELU_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const +{ + int elempack = bottom_top_blob.elempack; + + std::vector bindings(1); + bindings[0] = bottom_top_blob; + + std::vector constants(5); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h * bottom_top_blob.d; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_elu_pack8 + : elempack == 4 ? 
pipeline_elu_pack4 + : pipeline_elu; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + + return 0; +} + +int ELU_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const +{ + int elempack = bottom_top_blob.elempack; + + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = bottom_top_blob; + + std::vector constants(5); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h * bottom_top_blob.d; + constants[3].i = bottom_top_blob.c; + constants[4].i = 0; //bottom_top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_elu_pack8 + : elempack == 4 ? pipeline_elu_pack4 + : pipeline_elu; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/vulkan/elu_vulkan.h b/src/layer/vulkan/elu_vulkan.h new file mode 100644 index 000000000000..62da80a00c55 --- /dev/null +++ b/src/layer/vulkan/elu_vulkan.h @@ -0,0 +1,42 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_ELU_VULKAN_H +#define LAYER_ELU_VULKAN_H + +#include "elu.h" + +namespace ncnn { + +class ELU_vulkan : virtual public ELU +{ +public: + ELU_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + using ELU::forward_inplace; + virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_elu; + Pipeline* pipeline_elu_pack4; + Pipeline* pipeline_elu_pack8; +}; + +} // namespace ncnn + +#endif // LAYER_ELU_VULKAN_H diff --git a/src/layer/vulkan/shader/elu.comp b/src/layer/vulkan/shader/elu.comp new file mode 100644 index 000000000000..319606a012dc --- /dev/null +++ b/src/layer/vulkan/shader/elu.comp @@ -0,0 +1,73 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
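As a reading aid for the ELU_vulkan::create_pipeline hunk above, here is the packing decision it makes, restated as a scalar C++ sketch; illustrative only, not part of the patch, and the helper names are mine:

#include <cstddef>

// elempack: how many elements are interleaved per packed lane group,
// chosen from the extent along the packing axis (w, h or c depending on dims).
static int pick_elempack(int extent, bool use_shader_pack8)
{
    if (use_shader_pack8 && extent % 8 == 0) return 8;
    if (extent % 4 == 0) return 4;
    return 1;
}

// elemsize: bytes per packed element, depending on the fp16 options.
static size_t pick_elemsize(int elempack, bool use_fp16_storage, bool use_fp16_packed)
{
    if (use_fp16_storage) return elempack * 2u;                     // fp16 everywhere
    if (use_fp16_packed) return elempack == 1 ? 4u : elempack * 2u; // fp16 only when packed
    return elempack * 4u;                                           // fp32
}

The packed shape built from these values is what feeds the specialization constants and the local workgroup size selection in the hunk above.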
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float alpha = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); +#endif + + v = v > afp(0.0f) ? v : afp(alpha * (exp(v) - 1.0f)); + +#if NCNN_image_shader + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/src/layer/vulkan/shader/elu_pack4.comp b/src/layer/vulkan/shader/elu_pack4.comp new file mode 100644 index 000000000000..6d02f11bd4a9 --- /dev/null +++ b/src/layer/vulkan/shader/elu_pack4.comp @@ -0,0 +1,73 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
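The elu.comp shader above applies the ELU activation element-wise; a scalar C++ reference of the same formula, illustrative only and not part of the patch:

#include <cmath>

// f(x) = x for x > 0, and alpha * (exp(x) - 1) otherwise, which is exactly the
// ternary the shader evaluates per element, with alpha supplied as
// specialization constant 0.
static inline float elu_ref(float x, float alpha)
{
    return x > 0.f ? x : alpha * (std::exp(x) - 1.f);
}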
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float alpha = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); +#endif + + v = mix(afpvec4(alpha) * afpvec4(exp(v) - afpvec4(1.0f)), v, greaterThan(v, afpvec4(0.0f))); + +#if NCNN_image_shader + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/src/layer/vulkan/shader/elu_pack8.comp b/src/layer/vulkan/shader/elu_pack8.comp new file mode 100644 index 000000000000..0b8831d61a48 --- /dev/null +++ b/src/layer/vulkan/shader/elu_pack8.comp @@ -0,0 +1,75 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
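Both the pack1 and pack4 shaders address the storage buffer through the same flattened index; a scalar C++ restatement of that addressing, illustrative only and not part of the patch (the function name is mine):

// gi = gz * cstep + gy * w + gx, where the h constant already carries the
// collapsed h * d extent set up by the host code, and invocations with
// gx >= w, gy >= h or gz >= c return before touching the buffer.
static inline int flat_index(int gx, int gy, int gz, int w, int cstep)
{
    return gz * cstep + gy * w + gx;
}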
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float alpha = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); +#endif + + v[0] = mix(afpvec4(alpha) * afpvec4(exp(v[0]) - afpvec4(1.0f)), v[0], greaterThan(v[0], afpvec4(0.0f))); + v[1] = mix(afpvec4(alpha) * afpvec4(exp(v[1]) - afpvec4(1.0f)), v[1], greaterThan(v[1], afpvec4(0.0f))); + +#if NCNN_image_shader + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h index 2892e3d2bf75..0513d5e1be1b 100644 --- a/src/layer/x86/avx512_mathfun.h +++ b/src/layer/x86/avx512_mathfun.h @@ -182,6 +182,48 @@ static NCNN_FORCEINLINE __m512 exp512_ps(__m512 x) return y; } +_PS512_CONST(tanh_hi, 9.0f); +_PS512_CONST(tanh_lo, -9.0f); + +_PS512_CONST(cephes_tanh_p0, -2.76076847742355E-16f); +_PS512_CONST(cephes_tanh_p1, 2.00018790482477E-13f); +_PS512_CONST(cephes_tanh_p2, -8.60467152213735E-11f); +_PS512_CONST(cephes_tanh_p3, 5.12229709037114E-08f); +_PS512_CONST(cephes_tanh_p4, 1.48572235717979E-05f); +_PS512_CONST(cephes_tanh_p5, 6.37261928875436E-04f); +_PS512_CONST(cephes_tanh_p6, 4.89352455891786E-03f); + +_PS512_CONST(cephes_tanh_p7, 1.19825839466702e-06f); +_PS512_CONST(cephes_tanh_p8, 1.18534705686654e-04f); +_PS512_CONST(cephes_tanh_p9, 2.26843463243900e-03f); + +// an approximation of tanh +static inline __m512 tanh512_ps(const __m512 x) +{ + __m512 value = x; + value = _mm512_max_ps(*(__m512*)_ps512_tanh_lo, value); + value = _mm512_min_ps(*(__m512*)_ps512_tanh_hi, value); + + __m512 value_squared = _mm512_mul_ps(value, value); + + __m512 p; + p = _mm512_fmadd_ps(value_squared, *(__m512*)_ps512_cephes_tanh_p0, *(__m512*)_ps512_cephes_tanh_p1); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p2); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p3); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p4); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p5); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p6); + p = _mm512_mul_ps(p, value); + + __m512 q; + q = _mm512_fmadd_ps(value_squared, 
*(__m512*)_ps512_cephes_tanh_p7, *(__m512*)_ps512_cephes_tanh_p8); + q = _mm512_fmadd_ps(q, value_squared, *(__m512*)_ps512_cephes_tanh_p9); + q = _mm512_fmadd_ps(q, value_squared, *(__m512*)_ps512_cephes_tanh_p6); + + __m512 dst = _mm512_div_ps(p, q); + return dst; +} + _PS512_CONST(minus_cephes_DP1, -0.78515625f); _PS512_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); _PS512_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h index db28691344a8..645c399e4eba 100644 --- a/src/layer/x86/avx_mathfun.h +++ b/src/layer/x86/avx_mathfun.h @@ -295,6 +295,48 @@ static NCNN_FORCEINLINE __m256 exp256_ps(__m256 x) return y; } +_PS256_CONST(tanh_hi, 9.0f); +_PS256_CONST(tanh_lo, -9.0f); + +_PS256_CONST(cephes_tanh_p0, -2.76076847742355E-16f); +_PS256_CONST(cephes_tanh_p1, 2.00018790482477E-13f); +_PS256_CONST(cephes_tanh_p2, -8.60467152213735E-11f); +_PS256_CONST(cephes_tanh_p3, 5.12229709037114E-08f); +_PS256_CONST(cephes_tanh_p4, 1.48572235717979E-05f); +_PS256_CONST(cephes_tanh_p5, 6.37261928875436E-04f); +_PS256_CONST(cephes_tanh_p6, 4.89352455891786E-03f); + +_PS256_CONST(cephes_tanh_p7, 1.19825839466702e-06f); +_PS256_CONST(cephes_tanh_p8, 1.18534705686654e-04f); +_PS256_CONST(cephes_tanh_p9, 2.26843463243900e-03f); + +// an approximation of tanh +static inline __m256 tanh256_ps(const __m256 x) +{ + __m256 value = x; + value = _mm256_max_ps(*(__m256*)_ps256_tanh_lo, value); + value = _mm256_min_ps(*(__m256*)_ps256_tanh_hi, value); + + __m256 value_squared = _mm256_mul_ps(value, value); + + __m256 p; + p = _mm256_comp_fmadd_ps(value_squared, *(__m256*)_ps256_cephes_tanh_p0, *(__m256*)_ps256_cephes_tanh_p1); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p2); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p3); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p4); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p5); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p6); + p = _mm256_mul_ps(p, value); + + __m256 q; + q = _mm256_comp_fmadd_ps(value_squared, *(__m256*)_ps256_cephes_tanh_p7, *(__m256*)_ps256_cephes_tanh_p8); + q = _mm256_comp_fmadd_ps(q, value_squared, *(__m256*)_ps256_cephes_tanh_p9); + q = _mm256_comp_fmadd_ps(q, value_squared, *(__m256*)_ps256_cephes_tanh_p6); + + __m256 dst = _mm256_div_ps(p, q); + return dst; +} + _PS256_CONST(minus_cephes_DP1, -0.78515625f); _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); diff --git a/src/layer/x86/convolution_3x3_pack16to1.h b/src/layer/x86/convolution_3x3_pack16to1.h index e01383c8204f..0faefbbc4192 100644 --- a/src/layer/x86/convolution_3x3_pack16to1.h +++ b/src/layer/x86/convolution_3x3_pack16to1.h @@ -290,7 +290,7 @@ static void conv3x3s1_winograd63_pack16to1_avx512(const Mat& bottom_blob, Mat& t __m512 _re = _mm512_loadu_ps(r0 + 16 * 14); __m512 _rf = _mm512_loadu_ps(r0 + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -333,41 +333,7 @@ static void conv3x3s1_winograd63_pack16to1_avx512(const Mat& bottom_blob, Mat& t __m512 _r6 = _mm512_load_ps(r0 + 16 * 6); __m512 _r7 = _mm512_load_ps(r0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); 
- __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_3x3_pack8to1_int8.h b/src/layer/x86/convolution_3x3_pack8to1_int8.h index 4f687ac256d4..d5957faf6d89 100644 --- a/src/layer/x86/convolution_3x3_pack8to1_int8.h +++ b/src/layer/x86/convolution_3x3_pack8to1_int8.h @@ -125,11 +125,6 @@ static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(const Mat& k int p = 0; for (; p + 3 < outch; p += 4) { - const Mat k0 = kernel_tm.channel(p); - const Mat k1 = kernel_tm.channel(p + 1); - const Mat k2 = kernel_tm.channel(p + 2); - const Mat k3 = kernel_tm.channel(p + 3); - Mat g0 = kernel_tm_pack8to1.channel(p / 4); for (int k = 0; k < 36; k++) @@ -138,41 +133,15 @@ static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(const Mat& k for (int q = 0; q + 7 < inch; q += 8) { -#if __AVXVNNI__ || __AVX512VNNI__ || __XOP__ for (int i = 0; i < 4; i++) { - const short* k00 = k0.row(q + i * 2); - const short* k10 = k1.row(q + i * 2); - const short* k20 = k2.row(q + i * 2); - const short* k30 = k3.row(q + i * 2); - - const short* k01 = k0.row(q + i * 2 + 1); - const short* k11 = k1.row(q + i * 2 + 1); - const short* k21 = k2.row(q + i * 2 + 1); - const short* k31 = k3.row(q + i * 2 + 1); - - g00[0] = k00[k]; - g00[1] = k01[k]; - g00[2] = k10[k]; - g00[3] = k11[k]; - g00[4] = k20[k]; - g00[5] = k21[k]; - g00[6] = k30[k]; - g00[7] = k31[k]; - - g00 += 8; - } -#else 
- for (int i = 0; i < 8; i++) - { - g00[0] = k0.row(q + i)[k]; - g00[1] = k1.row(q + i)[k]; - g00[2] = k2.row(q + i)[k]; - g00[3] = k3.row(q + i)[k]; - - g00 += 4; + for (int j = 0; j < 8; j++) + { + const short* k00 = kernel_tm.channel(p + i).row(q + j); + g00[0] = k00[k]; + g00 += 1; + } } -#endif } } } @@ -508,114 +477,97 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& int nn = inch; // inch always > 0 - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); - __m256i _sum4_5 = _mm256_setzero_si256(); - __m256i _sum6_7 = _mm256_setzero_si256(); + __m256i _sum00_11 = _mm256_setzero_si256(); + __m256i _sum10_01 = _mm256_setzero_si256(); + __m256i _sum02_13 = _mm256_setzero_si256(); + __m256i _sum12_03 = _mm256_setzero_si256(); + + __m256i _sum04_15 = _mm256_setzero_si256(); + __m256i _sum14_05 = _mm256_setzero_si256(); + __m256i _sum06_17 = _mm256_setzero_si256(); + __m256i _sum16_07 = _mm256_setzero_si256(); for (int j = 0; j < nn; j++) { // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val0 = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); + #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val0_0123 = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val0_4567 = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val0_89ab = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val0_cdef = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val0_0123); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w01, _val0_89ab); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val0_4567); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w23, _val0_cdef); + _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); + _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); + _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); + _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); #else - // 0 0 1 1 2 2 3 3 8 8 9 9 a a b b - // 4 4 5 5 6 6 7 7 c c d d e e f f - __m256i _val0_0123_89ab = _mm256_unpacklo_epi16(_val0, _val0); - __m256i _val0_4567_cdef = _mm256_unpackhi_epi16(_val0, _val0); - - __m256i _val0_0123 = _mm256_permutevar8x32_epi32(_val0_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val0_4567 = _mm256_permutevar8x32_epi32(_val0_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val0_89ab = _mm256_permutevar8x32_epi32(_val0_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val0_cdef = _mm256_permutevar8x32_epi32(_val0_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val0_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val0_0123); - __m256i _sl10_11 = _mm256_mullo_epi16(_w01, _val0_89ab); - __m256i _sh10_11 = _mm256_mulhi_epi16(_w01, _val0_89ab); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val0_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val0_4567); - __m256i _sl12_13 = _mm256_mullo_epi16(_w23, _val0_cdef); - __m256i _sh12_13 = _mm256_mulhi_epi16(_w23, _val0_cdef); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl10_11, 
_sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl12_13, _sh12_13)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl12_13, _sh12_13)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); #endif - __m256i _val1 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); + __m256i _val23 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); + + __m256i _val32 = _mm256_permute4x64_epi64(_val23, 78); #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val1_0123 = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val1_4567 = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val1_89ab = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val1_cdef = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum4_5 = _mm256_dpwssd_epi32(_sum4_5, _w01, _val1_0123); - _sum6_7 = _mm256_dpwssd_epi32(_sum6_7, _w01, _val1_89ab); - _sum4_5 = _mm256_dpwssd_epi32(_sum4_5, _w23, _val1_4567); - _sum6_7 = _mm256_dpwssd_epi32(_sum6_7, _w23, _val1_cdef); + _sum04_15 = _mm256_dpwssd_epi32(_sum04_15, _val23, _w01); + _sum14_05 = _mm256_dpwssd_epi32(_sum14_05, _val32, _w01); + _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23, _w23); + _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32, _w23); #else - __m256i _val1_0123_89ab = _mm256_unpacklo_epi16(_val1, _val1); - __m256i _val1_4567_cdef = _mm256_unpackhi_epi16(_val1, _val1); - - __m256i _val1_0123 = _mm256_permutevar8x32_epi32(_val1_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val1_4567 = _mm256_permutevar8x32_epi32(_val1_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val1_89ab = _mm256_permutevar8x32_epi32(_val1_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val1_cdef = _mm256_permutevar8x32_epi32(_val1_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl04_05 = _mm256_mullo_epi16(_w01, _val1_0123); - __m256i _sh04_05 = _mm256_mulhi_epi16(_w01, _val1_0123); - __m256i _sl14_15 = _mm256_mullo_epi16(_w01, _val1_89ab); - __m256i _sh14_15 = _mm256_mulhi_epi16(_w01, _val1_89ab); - __m256i _sl06_07 = _mm256_mullo_epi16(_w23, _val1_4567); - __m256i _sh06_07 = _mm256_mulhi_epi16(_w23, _val1_4567); - __m256i _sl16_17 = _mm256_mullo_epi16(_w23, _val1_cdef); - __m256i _sh16_17 = _mm256_mulhi_epi16(_w23, _val1_cdef); - - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpacklo_epi16(_sl04_05, _sh04_05)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpacklo_epi16(_sl14_15, _sh14_15)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpacklo_epi16(_sl06_07, _sh06_07)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpacklo_epi16(_sl16_17, _sh16_17)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpackhi_epi16(_sl04_05, _sh04_05)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpackhi_epi16(_sl14_15, _sh14_15)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpackhi_epi16(_sl06_07, 
_sh06_07)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpackhi_epi16(_sl16_17, _sh16_17)); + _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23, _w01)); + _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32, _w01)); + _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23, _w23)); + _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32, _w23)); #endif r0 += 32; k0 += 32; } - __m256i _sum0_2 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum1_3 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 3, 0, 1)); - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); + // transpose 4x8 + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); + _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); + _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); + _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum04_15, _sum14_05); + _tmp1 = _mm256_unpacklo_epi32(_sum06_17, _sum16_07); + _tmp2 = _mm256_unpackhi_epi32(_sum04_15, _sum14_05); + _tmp3 = _mm256_unpackhi_epi32(_sum06_17, _sum16_07); + _sum04_15 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum14_05 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum06_17 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum16_07 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); + _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); - __m256i _sum4_6 = _mm256_permute2x128_si256(_sum4_5, _sum6_7, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum5_7 = _mm256_permute2x128_si256(_sum4_5, _sum6_7, _MM_SHUFFLE(0, 3, 0, 1)); - _sum4_6 = _mm256_add_epi32(_sum4_6, _sum5_7); + _sum04_15 = _mm256_add_epi32(_sum04_15, _sum14_05); + _sum06_17 = _mm256_add_epi32(_sum06_17, _sum16_07); + _sum04_15 = _mm256_add_epi32(_sum04_15, _sum06_17); + + __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); + _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); + _sum04_15 = _mm256_permutevar8x32_epi32(_sum04_15, _perm_mask); int sum[16]; - _mm256_storeu_si256((__m256i*)sum, _sum0_2); - _mm256_storeu_si256((__m256i*)(sum + 8), _sum4_6); + _mm256_storeu_si256((__m256i*)sum, _sum00_11); + _mm256_storeu_si256((__m256i*)(sum + 8), _sum04_15); output0_tm[0] = sum[0]; output1_tm[0] = sum[1]; @@ -651,60 +603,42 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& int nn = inch; // inch always > 0 #if __AVX2__ - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); + __m256i _sum00_11 = _mm256_setzero_si256(); + __m256i _sum10_01 = _mm256_setzero_si256(); + __m256i _sum02_13 = _mm256_setzero_si256(); + __m256i _sum12_03 = _mm256_setzero_si256(); #else - __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); + __m128i _sum00 = _mm_setzero_si128(); + __m128i _sum01 = _mm_setzero_si128(); + __m128i _sum02 = _mm_setzero_si128(); + __m128i _sum03 = _mm_setzero_si128(); + __m128i _sum10 = _mm_setzero_si128(); + __m128i _sum11 = _mm_setzero_si128(); + __m128i _sum12 = _mm_setzero_si128(); + __m128i _sum13 = _mm_setzero_si128(); #endif for (int j = 
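Two AVX2 building blocks recur in the rewritten accumulation loops above: _mm256_permute4x64_epi64(v, 78), where 78 == _MM_SHUFFLE(1, 0, 3, 2), swaps the low and high 128-bit halves of v (this is how _val10/_val32 are derived from _val01/_val23), and _mm256_madd_epi16(a, b) multiplies adjacent int16 pairs and adds each pair into one int32, replacing the old mullo/mulhi + unpack sequences; on AVX-VNNI, _mm256_dpwssd_epi32 fuses the same pairwise product with the accumulate. A small self-contained check of both facts (assumes an AVX2 build; illustrative only):

#include <immintrin.h>
#include <stdio.h>

int main()
{
    short a[16], b[16];
    for (int i = 0; i < 16; i++) { a[i] = (short)(i + 1); b[i] = (short)(2 * i - 5); }

    __m256i va = _mm256_loadu_si256((const __m256i*)a);
    __m256i vb = _mm256_loadu_si256((const __m256i*)b);

    // madd: each int32 lane holds a[2i]*b[2i] + a[2i+1]*b[2i+1]
    int madd[8];
    _mm256_storeu_si256((__m256i*)madd, _mm256_madd_epi16(va, vb));
    for (int i = 0; i < 8; i++)
    {
        int ref = a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1];
        printf("pair %d: simd=%d scalar=%d\n", i, madd[i], ref);
    }

    // permute4x64 with imm 78 swaps the two 128-bit halves
    short swapped[16];
    _mm256_storeu_si256((__m256i*)swapped, _mm256_permute4x64_epi64(va, 78));
    printf("swapped[0]=%d (== a[8]), swapped[8]=%d (== a[0])\n", swapped[0], swapped[8]);
    return 0;
}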
0; j < nn; j++) { #if __AVX2__ // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); + #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val_0123 = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val_4567 = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val_89ab = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val_cdef = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val_0123); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w01, _val_89ab); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val_4567); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w23, _val_cdef); + _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); + _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); + _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); + _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); #else - __m256i _val_0123_89ab = _mm256_unpacklo_epi16(_val, _val); - __m256i _val_4567_cdef = _mm256_unpackhi_epi16(_val, _val); - - __m256i _val_0123 = _mm256_permutevar8x32_epi32(_val_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val_4567 = _mm256_permutevar8x32_epi32(_val_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val_89ab = _mm256_permutevar8x32_epi32(_val_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val_cdef = _mm256_permutevar8x32_epi32(_val_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val_0123); - __m256i _sl10_11 = _mm256_mullo_epi16(_w01, _val_89ab); - __m256i _sh10_11 = _mm256_mulhi_epi16(_w01, _val_89ab); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val_4567); - __m256i _sl12_13 = _mm256_mullo_epi16(_w23, _val_cdef); - __m256i _sh12_13 = _mm256_mulhi_epi16(_w23, _val_cdef); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl12_13, _sh12_13)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl12_13, _sh12_13)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); #endif #else // 0 1 2 3 4 5 6 7 @@ -717,75 +651,23 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); #if __XOP__ - __m128i _val0_01 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i 
_val0_23 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val0_45 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val0_67 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i _val1_01 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val1_23 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val1_45 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val1_67 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(3, 3, 3, 3)); - - _sum0 = _mm_maddd_epi16(_val0_01, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val0_23, _w1, _sum1); - _sum2 = _mm_maddd_epi16(_val1_01, _w0, _sum2); - _sum3 = _mm_maddd_epi16(_val1_23, _w1, _sum3); - _sum0 = _mm_maddd_epi16(_val0_45, _w2, _sum0); - _sum1 = _mm_maddd_epi16(_val0_67, _w3, _sum1); - _sum2 = _mm_maddd_epi16(_val1_45, _w2, _sum2); - _sum3 = _mm_maddd_epi16(_val1_67, _w3, _sum3); + _sum00 = _mm_maddd_epi16(_val0, _w0, _sum00); + _sum01 = _mm_maddd_epi16(_val0, _w1, _sum01); + _sum02 = _mm_maddd_epi16(_val0, _w2, _sum02); + _sum03 = _mm_maddd_epi16(_val0, _w3, _sum03); + _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); + _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); + _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); + _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m128i _val0_0123 = _mm_unpacklo_epi16(_val0, _val0); - __m128i _val0_4567 = _mm_unpackhi_epi16(_val0, _val0); - - __m128i _val1_0123 = _mm_unpacklo_epi16(_val1, _val1); - __m128i _val1_4567 = _mm_unpackhi_epi16(_val1, _val1); - - __m128i _val0_01 = _mm_unpacklo_epi32(_val0_0123, _val0_0123); - __m128i _val0_23 = _mm_unpackhi_epi32(_val0_0123, _val0_0123); - __m128i _val0_45 = _mm_unpacklo_epi32(_val0_4567, _val0_4567); - __m128i _val0_67 = _mm_unpackhi_epi32(_val0_4567, _val0_4567); - - __m128i _val1_01 = _mm_unpacklo_epi32(_val1_0123, _val1_0123); - __m128i _val1_23 = _mm_unpackhi_epi32(_val1_0123, _val1_0123); - __m128i _val1_45 = _mm_unpacklo_epi32(_val1_4567, _val1_4567); - __m128i _val1_67 = _mm_unpackhi_epi32(_val1_4567, _val1_4567); - - __m128i _sl00 = _mm_mullo_epi16(_w0, _val0_01); - __m128i _sh00 = _mm_mulhi_epi16(_w0, _val0_01); - __m128i _sl10 = _mm_mullo_epi16(_w0, _val1_01); - __m128i _sh10 = _mm_mulhi_epi16(_w0, _val1_01); - __m128i _sl01 = _mm_mullo_epi16(_w1, _val0_23); - __m128i _sh01 = _mm_mulhi_epi16(_w1, _val0_23); - __m128i _sl11 = _mm_mullo_epi16(_w1, _val1_23); - __m128i _sh11 = _mm_mulhi_epi16(_w1, _val1_23); - __m128i _sl02 = _mm_mullo_epi16(_w2, _val0_45); - __m128i _sh02 = _mm_mulhi_epi16(_w2, _val0_45); - __m128i _sl12 = _mm_mullo_epi16(_w2, _val1_45); - __m128i _sh12 = _mm_mulhi_epi16(_w2, _val1_45); - __m128i _sl03 = _mm_mullo_epi16(_w3, _val0_67); - __m128i _sh03 = _mm_mulhi_epi16(_w3, _val0_67); - __m128i _sl13 = _mm_mullo_epi16(_w3, _val1_67); - __m128i _sh13 = _mm_mulhi_epi16(_w3, _val1_67); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl11, _sh11)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl02, _sh02)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl02, _sh02)); - _sum2 = 
_mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl12, _sh12)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl12, _sh12)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl03, _sh03)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl03, _sh03)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl13, _sh13)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl13, _sh13)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); + _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); + _sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); + _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); #endif #endif @@ -794,19 +676,64 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& } #if __AVX2__ - __m256i _sum0_2 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum1_3 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 3, 0, 1)); - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); + // transpose 4x8 + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); + _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); + _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); + _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); + _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); + + __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); + _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); int sum[8]; - _mm256_storeu_si256((__m256i*)sum, _sum0_2); + _mm256_storeu_si256((__m256i*)sum, _sum00_11); #else - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); + _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); + _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); + _tmp3 = _mm_unpackhi_epi32(_sum02, _sum03); + _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); + _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); + _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); + _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); + _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + + _sum00 = _mm_add_epi32(_sum00, _sum01); + _sum02 = _mm_add_epi32(_sum02, _sum03); + _sum10 = _mm_add_epi32(_sum10, _sum11); + _sum12 = _mm_add_epi32(_sum12, _sum13); + + _sum00 = _mm_add_epi32(_sum00, _sum02); + _sum10 = _mm_add_epi32(_sum10, _sum12); int sum[8]; - _mm_storeu_si128((__m128i*)sum, _sum0); - _mm_storeu_si128((__m128i*)(sum + 4), _sum2); + _mm_storeu_si128((__m128i*)sum, _sum00); + _mm_storeu_si128((__m128i*)(sum + 4), _sum10); #endif output0_tm[0] = sum[0]; @@ -835,9 
+762,12 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& #if __AVX2__ __m256i _sum0_1 = _mm256_setzero_si256(); + __m256i _sum2_3 = _mm256_setzero_si256(); #else __m128i _sum0 = _mm_setzero_si128(); __m128i _sum1 = _mm_setzero_si128(); + __m128i _sum2 = _mm_setzero_si128(); + __m128i _sum3 = _mm_setzero_si128(); #endif for (int j = 0; j < nn; j++) @@ -849,37 +779,14 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _valval = _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1); + #if __AVXVNNI__ || __AVX512VNNI__ - // 0 1 0 1 x x x x - // 0 1 0 1 0 1 0 1 - __m128i _val_01 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val_23 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val_45 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val_67 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(3, 3, 3, 3)); - - __m256i _val_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(_val_01), _val_23, 1); - __m256i _val_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(_val_45), _val_67, 1); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val_0123); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val_4567); + _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01); + _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m256i _val_0123 = _mm256_castsi128_si256(_mm_unpacklo_epi16(_val, _val)); - __m256i _val_4567 = _mm256_castsi128_si256(_mm_unpackhi_epi16(_val, _val)); - - _val_0123 = _mm256_permutevar8x32_epi32(_val_0123, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - _val_4567 = _mm256_permutevar8x32_epi32(_val_4567, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val_0123); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val_4567); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); + _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01)); + _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23)); #endif #else __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); @@ -888,43 +795,15 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); #if __XOP__ - __m128i _val01 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val23 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val45 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val67 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(3, 3, 3, 3)); - - _sum0 = _mm_maddd_epi16(_val01, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val23, _w1, _sum1); - _sum0 = _mm_maddd_epi16(_val45, _w2, _sum0); - _sum1 = _mm_maddd_epi16(_val67, _w3, _sum1); + _sum0 = _mm_maddd_epi16(_val, _w0, _sum0); + _sum1 = _mm_maddd_epi16(_val, _w1, _sum1); + _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); + _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m128i _val_0123 = _mm_unpacklo_epi16(_val, 
_val); - __m128i _val_4567 = _mm_unpackhi_epi16(_val, _val); - - __m128i _val01 = _mm_unpacklo_epi32(_val_0123, _val_0123); - __m128i _val23 = _mm_unpackhi_epi32(_val_0123, _val_0123); - __m128i _val45 = _mm_unpacklo_epi32(_val_4567, _val_4567); - __m128i _val67 = _mm_unpackhi_epi32(_val_4567, _val_4567); - - __m128i _sl0 = _mm_mullo_epi16(_w0, _val01); - __m128i _sh0 = _mm_mulhi_epi16(_w0, _val01); - __m128i _sl1 = _mm_mullo_epi16(_w1, _val23); - __m128i _sh1 = _mm_mulhi_epi16(_w1, _val23); - __m128i _sl2 = _mm_mullo_epi16(_w2, _val45); - __m128i _sh2 = _mm_mulhi_epi16(_w2, _val45); - __m128i _sl3 = _mm_mullo_epi16(_w3, _val67); - __m128i _sh3 = _mm_mulhi_epi16(_w3, _val67); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl3, _sh3)); + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); + _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); + _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); #endif #endif @@ -935,8 +814,27 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& #if __AVX2__ __m128i _sum0 = _mm256_extracti128_si256(_sum0_1, 0); __m128i _sum1 = _mm256_extracti128_si256(_sum0_1, 1); + __m128i _sum2 = _mm256_extracti128_si256(_sum2_3, 0); + __m128i _sum3 = _mm256_extracti128_si256(_sum2_3, 1); #endif + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum0, _sum1); + _tmp1 = _mm_unpacklo_epi32(_sum2, _sum3); + _tmp2 = _mm_unpackhi_epi32(_sum0, _sum1); + _tmp3 = _mm_unpackhi_epi32(_sum2, _sum3); + _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + _sum0 = _mm_add_epi32(_sum0, _sum1); + _sum2 = _mm_add_epi32(_sum2, _sum3); + + _sum0 = _mm_add_epi32(_sum0, _sum2); int sum[4]; _mm_storeu_si128((__m128i*)sum, _sum0); @@ -973,55 +871,38 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& const short* r0 = bb2.row(i / 4); const short* k0 = kernel0_tm.row(r); - __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); - __m128i _sum4 = _mm_setzero_si128(); - __m128i _sum5 = _mm_setzero_si128(); - __m128i _sum6 = _mm_setzero_si128(); - __m128i _sum7 = _mm_setzero_si128(); + __m256i _sum01 = _mm256_setzero_si256(); + __m256i _sum23 = _mm256_setzero_si256(); for (int q = 0; q < inch; q++) { - __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); - __m128i _val1 = _mm_loadu_si128((const __m128i*)(r0 + 8)); - __m128i _val2 = _mm_loadu_si128((const __m128i*)(r0 + 16)); - __m128i _val3 = _mm_loadu_si128((const __m128i*)(r0 + 24)); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val23 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); + __m256i _w01 = _mm256_inserti128_si256(_mm256_castsi128_si256(_w0), _w0, 1); - __m128i _sl0 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh0 = 
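The "transpose 4x4" blocks above use the classic SSE2 pattern: _mm_unpacklo/unpackhi_epi32 interleaves pairs of rows, then _mm_unpacklo/unpackhi_epi64 reassembles the columns, so the per-output-channel partial sums can be added vertically instead of needing horizontal reductions. A standalone sketch of the same shuffle sequence, checked against a scalar transpose (illustrative, SSE2 only):

#include <emmintrin.h>
#include <stdio.h>

int main()
{
    int m[4][4];
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            m[i][j] = i * 4 + j;

    __m128i r0 = _mm_loadu_si128((const __m128i*)m[0]);
    __m128i r1 = _mm_loadu_si128((const __m128i*)m[1]);
    __m128i r2 = _mm_loadu_si128((const __m128i*)m[2]);
    __m128i r3 = _mm_loadu_si128((const __m128i*)m[3]);

    // interleave 32-bit elements of row pairs, then 64-bit halves
    __m128i t0 = _mm_unpacklo_epi32(r0, r1); // 00 10 01 11
    __m128i t1 = _mm_unpacklo_epi32(r2, r3); // 20 30 21 31
    __m128i t2 = _mm_unpackhi_epi32(r0, r1); // 02 12 03 13
    __m128i t3 = _mm_unpackhi_epi32(r2, r3); // 22 32 23 33
    r0 = _mm_unpacklo_epi64(t0, t1);         // column 0
    r1 = _mm_unpackhi_epi64(t0, t1);         // column 1
    r2 = _mm_unpacklo_epi64(t2, t3);         // column 2
    r3 = _mm_unpackhi_epi64(t2, t3);         // column 3

    int out[4][4];
    _mm_storeu_si128((__m128i*)out[0], r0);
    _mm_storeu_si128((__m128i*)out[1], r1);
    _mm_storeu_si128((__m128i*)out[2], r2);
    _mm_storeu_si128((__m128i*)out[3], r3);

    int ok = 1;
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            if (out[i][j] != m[j][i]) ok = 0;
    printf(ok ? "transpose ok\n" : "transpose mismatch\n");
    return 0;
}

The 8-lane "transpose 4x8" variants above apply the same epi32/epi64 unpack pattern per 128-bit half of each __m256i, then fix up the cross-lane order with the permutevar8x32 mask.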
_mm_mulhi_epi16(_val0, _w0); - __m128i _sl1 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh1 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl2 = _mm_mullo_epi16(_val2, _w0); - __m128i _sh2 = _mm_mulhi_epi16(_val2, _w0); - __m128i _sl3 = _mm_mullo_epi16(_val3, _w0); - __m128i _sh3 = _mm_mulhi_epi16(_val3, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum4 = _mm_add_epi32(_sum4, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum5 = _mm_add_epi32(_sum5, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum6 = _mm_add_epi32(_sum6, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum7 = _mm_add_epi32(_sum7, _mm_unpackhi_epi16(_sl3, _sh3)); +#if __AVXVNNI__ || __AVX512VNNI__ + _sum01 = _mm256_dpwssd_epi32(_sum01, _val01, _w01); + _sum23 = _mm256_dpwssd_epi32(_sum23, _val23, _w01); +#else + _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01, _w01)); + _sum23 = _mm256_add_epi32(_sum23, _mm256_madd_epi16(_val23, _w01)); +#endif k0 += 8; r0 += 32; } - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); - _sum4 = _mm_add_epi32(_sum4, _sum5); - _sum6 = _mm_add_epi32(_sum6, _sum7); + __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); + __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); + __m128i _sum2 = _mm256_extracti128_si256(_sum23, 0); + __m128i _sum3 = _mm256_extracti128_si256(_sum23, 1); output0_tm[0] = _mm_reduce_add_epi32(_sum0); - output0_tm[1] = _mm_reduce_add_epi32(_sum2); - output0_tm[2] = _mm_reduce_add_epi32(_sum4); - output0_tm[3] = _mm_reduce_add_epi32(_sum6); + output0_tm[1] = _mm_reduce_add_epi32(_sum1); + output0_tm[2] = _mm_reduce_add_epi32(_sum2); + output0_tm[3] = _mm_reduce_add_epi32(_sum3); output0_tm += 4; } #endif @@ -1034,37 +915,52 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& #endif const short* k0 = kernel0_tm.row(r); +#if __AVX2__ + __m256i _sum01 = _mm256_setzero_si256(); +#else __m128i _sum0 = _mm_setzero_si128(); __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); +#endif for (int q = 0; q < inch; q++) { +#if __AVX2__ + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); + + __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); + __m256i _w01 = _mm256_inserti128_si256(_mm256_castsi128_si256(_w0), _w0, 1); + +#if __AVXVNNI__ || __AVX512VNNI__ + _sum01 = _mm256_dpwssd_epi32(_sum01, _val01, _w01); +#else + _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01, _w01)); +#endif +#else __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); __m128i _val1 = _mm_loadu_si128((const __m128i*)(r0 + 8)); __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m128i _sl0 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh0 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl1 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh1 = _mm_mulhi_epi16(_val1, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl1, _sh1)); +#if __XOP__ + _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); + _sum1 = _mm_maddd_epi16(_val1, _w0, _sum1); +#else + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum1); +#endif +#endif k0 += 8; r0 += 16; } - _sum0 = 
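The simplified kernel repacking above (both the pack8to1 and pack8to4 variants) drops the per-ISA branches: for every transform position k it now copies, for each of the 4 output channels in the group, the 8 input-channel weights contiguously. A plain-array sketch of the resulting layout, assuming kernel_tm is indexed as [outch][inch][36]; the flat buffers and sizes here are hypothetical stand-ins for ncnn's Mat, for illustration only:

#include <cstdio>
#include <vector>

int main()
{
    const int outch = 8, inch = 16; // multiples of 4 and 8 for this sketch
    std::vector<short> ktm(outch * inch * 36);
    for (size_t i = 0; i < ktm.size(); i++)
        ktm[i] = (short)i;

    // packed layout: [outch/4 group][36][inch/8 block][4 outch][8 inch]
    std::vector<short> packed(outch * inch * 36);
    short* g00 = packed.data();
    for (int p = 0; p + 3 < outch; p += 4)
        for (int k = 0; k < 36; k++)
            for (int q = 0; q + 7 < inch; q += 8)
                for (int i = 0; i < 4; i++)
                    for (int j = 0; j < 8; j++)
                        // mirrors: kernel_tm.channel(p + i).row(q + j)[k]
                        *g00++ = ktm[((p + i) * inch + (q + j)) * 36 + k];

    printf("packed %zu weights\n", packed.size());
    return 0;
}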
_mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); +#if __AVX2__ + __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); + __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); +#endif output0_tm[0] = _mm_reduce_add_epi32(_sum0); - output0_tm[1] = _mm_reduce_add_epi32(_sum2); + output0_tm[1] = _mm_reduce_add_epi32(_sum1); output0_tm += 2; } for (; i < tiles; i++) @@ -1077,26 +973,23 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& const short* k0 = kernel0_tm.row(r); __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); for (int q = 0; q < inch; q++) { - __m128i _val = _mm_loadu_si128((const __m128i*)r0); + __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m128i _sl0 = _mm_mullo_epi16(_val, _w0); - __m128i _sh0 = _mm_mulhi_epi16(_val, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); +#if __XOP__ + _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); +#else + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); +#endif k0 += 8; r0 += 8; } - _sum0 = _mm_add_epi32(_sum0, _sum1); - output0_tm[0] = _mm_reduce_add_epi32(_sum0); output0_tm++; } diff --git a/src/layer/x86/convolution_3x3_pack8to4_int8.h b/src/layer/x86/convolution_3x3_pack8to4_int8.h index 547de8774a64..2bb48ce1903a 100644 --- a/src/layer/x86/convolution_3x3_pack8to4_int8.h +++ b/src/layer/x86/convolution_3x3_pack8to4_int8.h @@ -125,59 +125,23 @@ static void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(const Mat& k int q = 0; for (; q + 3 < outch; q += 4) { - const Mat k0 = kernel_tm.channel(q); - const Mat k1 = kernel_tm.channel(q + 1); - const Mat k2 = kernel_tm.channel(q + 2); - const Mat k3 = kernel_tm.channel(q + 3); - - Mat kernel_tm = kernel_tm_pack8.channel(q / 4); + Mat g0 = kernel_tm_pack8.channel(q / 4); for (int k = 0; k < 36; k++) { - short* g00 = kernel_tm.row(k); + short* g00 = g0.row(k); for (int p = 0; p + 7 < inch; p += 8) { -#if __AVXVNNI__ || __AVX512VNNI__ || __XOP__ for (int i = 0; i < 4; i++) { - const short* k00 = k0.row(p + i * 2); - const short* k10 = k1.row(p + i * 2); - const short* k20 = k2.row(p + i * 2); - const short* k30 = k3.row(p + i * 2); - - const short* k01 = k0.row(p + i * 2 + 1); - const short* k11 = k1.row(p + i * 2 + 1); - const short* k21 = k2.row(p + i * 2 + 1); - const short* k31 = k3.row(p + i * 2 + 1); - - g00[0] = k00[k]; - g00[1] = k01[k]; - g00[2] = k10[k]; - g00[3] = k11[k]; - g00[4] = k20[k]; - g00[5] = k21[k]; - g00[6] = k30[k]; - g00[7] = k31[k]; - - g00 += 8; - } -#else - for (int i = 0; i < 8; i++) - { - const short* k00 = k0.row(p + i); - const short* k10 = k1.row(p + i); - const short* k20 = k2.row(p + i); - const short* k30 = k3.row(p + i); - - g00[0] = k00[k]; - g00[1] = k10[k]; - g00[2] = k20[k]; - g00[3] = k30[k]; - - g00 += 4; + for (int j = 0; j < 8; j++) + { + const short* k00 = kernel_tm.channel(q + i).row(p + j); + g00[0] = k00[k]; + g00 += 1; + } } -#endif } } } @@ -482,113 +446,96 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& int nn = inch; // inch always > 0 - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); - __m256i _sum4_5 = _mm256_setzero_si256(); - __m256i _sum6_7 = _mm256_setzero_si256(); + __m256i _sum00_11 = _mm256_setzero_si256(); + __m256i _sum10_01 = _mm256_setzero_si256(); + __m256i _sum02_13 = _mm256_setzero_si256(); + __m256i _sum12_03 = 
_mm256_setzero_si256(); + + __m256i _sum04_15 = _mm256_setzero_si256(); + __m256i _sum14_05 = _mm256_setzero_si256(); + __m256i _sum06_17 = _mm256_setzero_si256(); + __m256i _sum16_07 = _mm256_setzero_si256(); for (int j = 0; j < nn; j++) { // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val0 = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); + #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val0_0123 = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val0_4567 = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val0_89ab = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val0_cdef = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val0_0123); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w01, _val0_89ab); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val0_4567); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w23, _val0_cdef); + _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); + _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); + _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); + _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); #else - // 0 0 1 1 2 2 3 3 8 8 9 9 a a b b - // 4 4 5 5 6 6 7 7 c c d d e e f f - __m256i _val0_0123_89ab = _mm256_unpacklo_epi16(_val0, _val0); - __m256i _val0_4567_cdef = _mm256_unpackhi_epi16(_val0, _val0); - - __m256i _val0_0123 = _mm256_permutevar8x32_epi32(_val0_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val0_4567 = _mm256_permutevar8x32_epi32(_val0_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val0_89ab = _mm256_permutevar8x32_epi32(_val0_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val0_cdef = _mm256_permutevar8x32_epi32(_val0_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val0_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val0_0123); - __m256i _sl10_11 = _mm256_mullo_epi16(_w01, _val0_89ab); - __m256i _sh10_11 = _mm256_mulhi_epi16(_w01, _val0_89ab); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val0_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val0_4567); - __m256i _sl12_13 = _mm256_mullo_epi16(_w23, _val0_cdef); - __m256i _sh12_13 = _mm256_mulhi_epi16(_w23, _val0_cdef); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl12_13, _sh12_13)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl12_13, _sh12_13)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); + _sum12_03 = _mm256_add_epi32(_sum12_03, 
_mm256_madd_epi16(_val10, _w23)); #endif - __m256i _val1 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); + __m256i _val23 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); + + __m256i _val32 = _mm256_permute4x64_epi64(_val23, 78); #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val1_0123 = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val1_4567 = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val1_89ab = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val1_cdef = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum4_5 = _mm256_dpwssd_epi32(_sum4_5, _w01, _val1_0123); - _sum6_7 = _mm256_dpwssd_epi32(_sum6_7, _w01, _val1_89ab); - _sum4_5 = _mm256_dpwssd_epi32(_sum4_5, _w23, _val1_4567); - _sum6_7 = _mm256_dpwssd_epi32(_sum6_7, _w23, _val1_cdef); + _sum04_15 = _mm256_dpwssd_epi32(_sum04_15, _val23, _w01); + _sum14_05 = _mm256_dpwssd_epi32(_sum14_05, _val32, _w01); + _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23, _w23); + _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32, _w23); #else - __m256i _val1_0123_89ab = _mm256_unpacklo_epi16(_val1, _val1); - __m256i _val1_4567_cdef = _mm256_unpackhi_epi16(_val1, _val1); - - __m256i _val1_0123 = _mm256_permutevar8x32_epi32(_val1_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val1_4567 = _mm256_permutevar8x32_epi32(_val1_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val1_89ab = _mm256_permutevar8x32_epi32(_val1_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val1_cdef = _mm256_permutevar8x32_epi32(_val1_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl04_05 = _mm256_mullo_epi16(_w01, _val1_0123); - __m256i _sh04_05 = _mm256_mulhi_epi16(_w01, _val1_0123); - __m256i _sl14_15 = _mm256_mullo_epi16(_w01, _val1_89ab); - __m256i _sh14_15 = _mm256_mulhi_epi16(_w01, _val1_89ab); - __m256i _sl06_07 = _mm256_mullo_epi16(_w23, _val1_4567); - __m256i _sh06_07 = _mm256_mulhi_epi16(_w23, _val1_4567); - __m256i _sl16_17 = _mm256_mullo_epi16(_w23, _val1_cdef); - __m256i _sh16_17 = _mm256_mulhi_epi16(_w23, _val1_cdef); - - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpacklo_epi16(_sl04_05, _sh04_05)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpacklo_epi16(_sl14_15, _sh14_15)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpacklo_epi16(_sl06_07, _sh06_07)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpacklo_epi16(_sl16_17, _sh16_17)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpackhi_epi16(_sl04_05, _sh04_05)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpackhi_epi16(_sl14_15, _sh14_15)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpackhi_epi16(_sl06_07, _sh06_07)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpackhi_epi16(_sl16_17, _sh16_17)); + _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23, _w01)); + _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32, _w01)); + _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23, _w23)); + _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32, _w23)); #endif r0 += 32; k0 += 32; } - __m256i _sum0_2 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum1_3 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 3, 0, 1)); - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); + // transpose 4x8 + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); + _tmp1 = 
_mm256_unpacklo_epi32(_sum02_13, _sum12_03); + _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); + _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum04_15, _sum14_05); + _tmp1 = _mm256_unpacklo_epi32(_sum06_17, _sum16_07); + _tmp2 = _mm256_unpackhi_epi32(_sum04_15, _sum14_05); + _tmp3 = _mm256_unpackhi_epi32(_sum06_17, _sum16_07); + _sum04_15 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum14_05 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum06_17 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum16_07 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); + _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); - __m256i _sum4_6 = _mm256_permute2x128_si256(_sum4_5, _sum6_7, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum5_7 = _mm256_permute2x128_si256(_sum4_5, _sum6_7, _MM_SHUFFLE(0, 3, 0, 1)); - _sum4_6 = _mm256_add_epi32(_sum4_6, _sum5_7); + _sum04_15 = _mm256_add_epi32(_sum04_15, _sum14_05); + _sum06_17 = _mm256_add_epi32(_sum06_17, _sum16_07); + _sum04_15 = _mm256_add_epi32(_sum04_15, _sum06_17); - _mm256_storeu_si256((__m256i*)output0_tm, _sum0_2); - _mm256_storeu_si256((__m256i*)(output0_tm + 8), _sum4_6); + __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); + _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); + _sum04_15 = _mm256_permutevar8x32_epi32(_sum04_15, _perm_mask); + + _mm256_storeu_si256((__m256i*)output0_tm, _sum00_11); + _mm256_storeu_si256((__m256i*)(output0_tm + 8), _sum04_15); output0_tm += 16; } #endif @@ -604,60 +551,42 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& int nn = inch; // inch always > 0 #if __AVX2__ - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); + __m256i _sum00_11 = _mm256_setzero_si256(); + __m256i _sum10_01 = _mm256_setzero_si256(); + __m256i _sum02_13 = _mm256_setzero_si256(); + __m256i _sum12_03 = _mm256_setzero_si256(); #else - __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); + __m128i _sum00 = _mm_setzero_si128(); + __m128i _sum01 = _mm_setzero_si128(); + __m128i _sum02 = _mm_setzero_si128(); + __m128i _sum03 = _mm_setzero_si128(); + __m128i _sum10 = _mm_setzero_si128(); + __m128i _sum11 = _mm_setzero_si128(); + __m128i _sum12 = _mm_setzero_si128(); + __m128i _sum13 = _mm_setzero_si128(); #endif for (int j = 0; j < nn; j++) { #if __AVX2__ // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); + #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val_0123 = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val_4567 = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val_89ab = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val_cdef = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 
6)); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val_0123); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w01, _val_89ab); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val_4567); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w23, _val_cdef); + _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); + _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); + _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); + _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); #else - __m256i _val_0123_89ab = _mm256_unpacklo_epi16(_val, _val); - __m256i _val_4567_cdef = _mm256_unpackhi_epi16(_val, _val); - - __m256i _val_0123 = _mm256_permutevar8x32_epi32(_val_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val_4567 = _mm256_permutevar8x32_epi32(_val_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val_89ab = _mm256_permutevar8x32_epi32(_val_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val_cdef = _mm256_permutevar8x32_epi32(_val_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val_0123); - __m256i _sl10_11 = _mm256_mullo_epi16(_w01, _val_89ab); - __m256i _sh10_11 = _mm256_mulhi_epi16(_w01, _val_89ab); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val_4567); - __m256i _sl12_13 = _mm256_mullo_epi16(_w23, _val_cdef); - __m256i _sh12_13 = _mm256_mulhi_epi16(_w23, _val_cdef); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl12_13, _sh12_13)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl12_13, _sh12_13)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); #endif #else // 0 1 2 3 4 5 6 7 @@ -670,75 +599,23 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); #if __XOP__ - __m128i _val0_01 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val0_23 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val0_45 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val0_67 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i _val1_01 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val1_23 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val1_45 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val1_67 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(3, 3, 3, 3)); - - _sum0 = _mm_maddd_epi16(_val0_01, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val0_23, _w1, _sum1); - _sum2 = _mm_maddd_epi16(_val1_01, _w0, _sum2); - _sum3 = _mm_maddd_epi16(_val1_23, _w1, _sum3); - _sum0 = _mm_maddd_epi16(_val0_45, _w2, _sum0); - _sum1 = _mm_maddd_epi16(_val0_67, _w3, _sum1); 
- _sum2 = _mm_maddd_epi16(_val1_45, _w2, _sum2); - _sum3 = _mm_maddd_epi16(_val1_67, _w3, _sum3); + _sum00 = _mm_maddd_epi16(_val0, _w0, _sum00); + _sum01 = _mm_maddd_epi16(_val0, _w1, _sum01); + _sum02 = _mm_maddd_epi16(_val0, _w2, _sum02); + _sum03 = _mm_maddd_epi16(_val0, _w3, _sum03); + _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); + _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); + _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); + _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m128i _val0_0123 = _mm_unpacklo_epi16(_val0, _val0); - __m128i _val0_4567 = _mm_unpackhi_epi16(_val0, _val0); - - __m128i _val1_0123 = _mm_unpacklo_epi16(_val1, _val1); - __m128i _val1_4567 = _mm_unpackhi_epi16(_val1, _val1); - - __m128i _val0_01 = _mm_unpacklo_epi32(_val0_0123, _val0_0123); - __m128i _val0_23 = _mm_unpackhi_epi32(_val0_0123, _val0_0123); - __m128i _val0_45 = _mm_unpacklo_epi32(_val0_4567, _val0_4567); - __m128i _val0_67 = _mm_unpackhi_epi32(_val0_4567, _val0_4567); - - __m128i _val1_01 = _mm_unpacklo_epi32(_val1_0123, _val1_0123); - __m128i _val1_23 = _mm_unpackhi_epi32(_val1_0123, _val1_0123); - __m128i _val1_45 = _mm_unpacklo_epi32(_val1_4567, _val1_4567); - __m128i _val1_67 = _mm_unpackhi_epi32(_val1_4567, _val1_4567); - - __m128i _sl00 = _mm_mullo_epi16(_w0, _val0_01); - __m128i _sh00 = _mm_mulhi_epi16(_w0, _val0_01); - __m128i _sl10 = _mm_mullo_epi16(_w0, _val1_01); - __m128i _sh10 = _mm_mulhi_epi16(_w0, _val1_01); - __m128i _sl01 = _mm_mullo_epi16(_w1, _val0_23); - __m128i _sh01 = _mm_mulhi_epi16(_w1, _val0_23); - __m128i _sl11 = _mm_mullo_epi16(_w1, _val1_23); - __m128i _sh11 = _mm_mulhi_epi16(_w1, _val1_23); - __m128i _sl02 = _mm_mullo_epi16(_w2, _val0_45); - __m128i _sh02 = _mm_mulhi_epi16(_w2, _val0_45); - __m128i _sl12 = _mm_mullo_epi16(_w2, _val1_45); - __m128i _sh12 = _mm_mulhi_epi16(_w2, _val1_45); - __m128i _sl03 = _mm_mullo_epi16(_w3, _val0_67); - __m128i _sh03 = _mm_mulhi_epi16(_w3, _val0_67); - __m128i _sl13 = _mm_mullo_epi16(_w3, _val1_67); - __m128i _sh13 = _mm_mulhi_epi16(_w3, _val1_67); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl11, _sh11)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl02, _sh02)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl02, _sh02)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl12, _sh12)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl12, _sh12)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl03, _sh03)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl03, _sh03)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl13, _sh13)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl13, _sh13)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); + _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); + _sum12 = 
_mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); + _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); #endif #endif @@ -747,17 +624,62 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& } #if __AVX2__ - __m256i _sum0_2 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum1_3 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 3, 0, 1)); - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); + // transpose 4x8 + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); + _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); + _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); + _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } - _mm256_storeu_si256((__m256i*)output0_tm, _sum0_2); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); + _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); + + __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); + _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); + + _mm256_storeu_si256((__m256i*)output0_tm, _sum00_11); #else - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); + _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); + _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); + _tmp3 = _mm_unpackhi_epi32(_sum02, _sum03); + _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); + _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); + _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); + _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); + _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } - _mm_storeu_si128((__m128i*)output0_tm, _sum0); - _mm_storeu_si128((__m128i*)(output0_tm + 4), _sum2); + _sum00 = _mm_add_epi32(_sum00, _sum01); + _sum02 = _mm_add_epi32(_sum02, _sum03); + _sum10 = _mm_add_epi32(_sum10, _sum11); + _sum12 = _mm_add_epi32(_sum12, _sum13); + + _sum00 = _mm_add_epi32(_sum00, _sum02); + _sum10 = _mm_add_epi32(_sum10, _sum12); + + _mm_storeu_si128((__m128i*)output0_tm, _sum00); + _mm_storeu_si128((__m128i*)(output0_tm + 4), _sum10); #endif output0_tm += 8; } @@ -774,9 +696,12 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& #if __AVX2__ __m256i _sum0_1 = _mm256_setzero_si256(); + __m256i _sum2_3 = _mm256_setzero_si256(); #else __m128i _sum0 = _mm_setzero_si128(); __m128i _sum1 = _mm_setzero_si128(); + __m128i _sum2 = _mm_setzero_si128(); + __m128i _sum3 = _mm_setzero_si128(); #endif for (int j = 0; j < nn; j++) @@ -787,37 +712,14 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _valval = _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1); + #if __AVXVNNI__ || __AVX512VNNI__ - // 0 1 0 1 x x x x - // 0 1 0 1 0 1 
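In the remainder loops above, _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1) duplicates one 128-bit register into both halves of a 256-bit register (the cast leaves the upper half undefined, the insert then fills it), so one load of eight int16 values can be multiplied against two weight registers at once; _mm256_broadcastsi128_si256(_val) expresses the same thing on AVX2. A tiny check (assumes an AVX2 build; illustrative only):

#include <immintrin.h>
#include <stdio.h>

int main()
{
    short v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    __m128i val = _mm_loadu_si128((const __m128i*)v);

    // duplicate the 128-bit lane into both halves of a 256-bit register
    __m256i valval = _mm256_inserti128_si256(_mm256_castsi128_si256(val), val, 1);

    short out[16];
    _mm256_storeu_si256((__m256i*)out, valval);
    for (int i = 0; i < 16; i++)
        printf("%d ", out[i]); // prints 1..8 twice
    printf("\n");
    return 0;
}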
0 1 - __m128i _val_01 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val_23 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val_45 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val_67 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(3, 3, 3, 3)); - - __m256i _val_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(_val_01), _val_23, 1); - __m256i _val_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(_val_45), _val_67, 1); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val_0123); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val_4567); + _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01); + _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m256i _val_0123 = _mm256_castsi128_si256(_mm_unpacklo_epi16(_val, _val)); - __m256i _val_4567 = _mm256_castsi128_si256(_mm_unpackhi_epi16(_val, _val)); - - _val_0123 = _mm256_permutevar8x32_epi32(_val_0123, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - _val_4567 = _mm256_permutevar8x32_epi32(_val_4567, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val_0123); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val_4567); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); + _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01)); + _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23)); #endif #else __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); @@ -826,43 +728,15 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); #if __XOP__ - __m128i _val01 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val23 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val45 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val67 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(3, 3, 3, 3)); - - _sum0 = _mm_maddd_epi16(_val01, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val23, _w1, _sum1); - _sum0 = _mm_maddd_epi16(_val45, _w2, _sum0); - _sum1 = _mm_maddd_epi16(_val67, _w3, _sum1); + _sum0 = _mm_maddd_epi16(_val, _w0, _sum0); + _sum1 = _mm_maddd_epi16(_val, _w1, _sum1); + _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); + _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m128i _val_0123 = _mm_unpacklo_epi16(_val, _val); - __m128i _val_4567 = _mm_unpackhi_epi16(_val, _val); - - __m128i _val01 = _mm_unpacklo_epi32(_val_0123, _val_0123); - __m128i _val23 = _mm_unpackhi_epi32(_val_0123, _val_0123); - __m128i _val45 = _mm_unpacklo_epi32(_val_4567, _val_4567); - __m128i _val67 = _mm_unpackhi_epi32(_val_4567, _val_4567); - - __m128i _sl0 = _mm_mullo_epi16(_w0, _val01); - __m128i _sh0 = _mm_mulhi_epi16(_w0, _val01); - __m128i _sl1 = _mm_mullo_epi16(_w1, _val23); - __m128i _sh1 = _mm_mulhi_epi16(_w1, _val23); - __m128i _sl2 = _mm_mullo_epi16(_w2, _val45); - __m128i _sh2 = _mm_mulhi_epi16(_w2, _val45); - __m128i _sl3 = _mm_mullo_epi16(_w3, _val67); - __m128i _sh3 = _mm_mulhi_epi16(_w3, _val67); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = 
_mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl3, _sh3)); + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); + _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); + _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); #endif #endif @@ -873,8 +747,27 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& #if __AVX2__ __m128i _sum0 = _mm256_extracti128_si256(_sum0_1, 0); __m128i _sum1 = _mm256_extracti128_si256(_sum0_1, 1); + __m128i _sum2 = _mm256_extracti128_si256(_sum2_3, 0); + __m128i _sum3 = _mm256_extracti128_si256(_sum2_3, 1); #endif + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum0, _sum1); + _tmp1 = _mm_unpacklo_epi32(_sum2, _sum3); + _tmp2 = _mm_unpackhi_epi32(_sum0, _sum1); + _tmp3 = _mm_unpackhi_epi32(_sum2, _sum3); + _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + _sum0 = _mm_add_epi32(_sum0, _sum1); + _sum2 = _mm_add_epi32(_sum2, _sum3); + + _sum0 = _mm_add_epi32(_sum0, _sum2); _mm_storeu_si128((__m128i*)output0_tm, _sum0); output0_tm += 4; diff --git a/src/layer/x86/convolution_sgemm_int8.h b/src/layer/x86/convolution_sgemm_int8.h index a533ce79d02d..34097f057b72 100644 --- a/src/layer/x86/convolution_sgemm_int8.h +++ b/src/layer/x86/convolution_sgemm_int8.h @@ -338,17 +338,8 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const if (nn4 > 0) { -#if __AVXVNNI__ || __AVX512VNNI__ - __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum30_22 = _mm256_setzero_si256(); -#else __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum01_13 = _mm256_setzero_si256(); - __m256i _sum11_03 = _mm256_setzero_si256(); __m256i _sum30_22 = _mm256_setzero_si256(); - __m256i _sum21_33 = _mm256_setzero_si256(); - __m256i _sum31_23 = _mm256_setzero_si256(); -#endif int j = 0; for (; j < nn4; j++) @@ -371,72 +362,21 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const _sum20_32 = _mm256_dpwssd_epi32(_sum20_32, _val23_16, _w01_16); _sum30_22 = _mm256_dpwssd_epi32(_sum30_22, _val32_16, _w01_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl20_31 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh20_31 = _mm256_mulhi_epi16(_val23_16, _w01_16); - __m256i _sl30_21 = _mm256_mullo_epi16(_val32_16, _w01_16); - __m256i _sh30_21 = _mm256_mulhi_epi16(_val32_16, _w01_16); - - _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum01_13 = _mm256_add_epi32(_sum01_13, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum11_03 = _mm256_add_epi32(_sum11_03, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum20_32 = _mm256_add_epi32(_sum20_32, 
_mm256_unpacklo_epi16(_sl20_31, _sh20_31)); - _sum30_22 = _mm256_add_epi32(_sum30_22, _mm256_unpacklo_epi16(_sl30_21, _sh30_21)); - _sum21_33 = _mm256_add_epi32(_sum21_33, _mm256_unpackhi_epi16(_sl20_31, _sh20_31)); - _sum31_23 = _mm256_add_epi32(_sum31_23, _mm256_unpackhi_epi16(_sl30_21, _sh30_21)); + _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum20_32 = _mm256_add_epi32(_sum20_32, _mm256_madd_epi16(_val23_16, _w01_16)); + _sum30_22 = _mm256_add_epi32(_sum30_22, _mm256_madd_epi16(_val32_16, _w01_16)); #endif tmpptr += 16; kptr0 += 16; } -#if __AVXVNNI__ || __AVX512VNNI__ _sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02); _sum20_32 = _mm256_hadd_epi32(_sum20_32, _sum30_22); _sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0)); _sum20_32 = _mm256_permute4x64_epi64(_sum20_32, _MM_SHUFFLE(2, 1, 3, 0)); -#else - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_12, _sum10_02); - _tmp1 = _mm256_unpacklo_epi32(_sum01_13, _sum11_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_12, _sum10_02); - _tmp3 = _mm256_unpackhi_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_02 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum01_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum11_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum20_32, _sum30_22); - _tmp1 = _mm256_unpacklo_epi32(_sum21_33, _sum31_23); - _tmp2 = _mm256_unpackhi_epi32(_sum20_32, _sum30_22); - _tmp3 = _mm256_unpackhi_epi32(_sum21_33, _sum31_23); - _sum20_32 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum30_22 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum21_33 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum31_23 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum10_02); - _sum01_13 = _mm256_add_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum01_13); - - _sum20_32 = _mm256_add_epi32(_sum20_32, _sum30_22); - _sum21_33 = _mm256_add_epi32(_sum21_33, _sum31_23); - _sum20_32 = _mm256_add_epi32(_sum20_32, _sum21_33); - - __m256i _perm_mask = _mm256_set_epi32(6, 4, 3, 1, 7, 5, 2, 0); - _sum00_12 = _mm256_permutevar8x32_epi32(_sum00_12, _perm_mask); - _sum20_32 = _mm256_permutevar8x32_epi32(_sum20_32, _perm_mask); -#endif } __m128i _sum00 = _mm256_extracti128_si256(_sum00_12, 0); @@ -532,25 +472,10 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const if (nn4 > 0) { #if __AVX2__ -#if __AVXVNNI__ || __AVX512VNNI__ - __m256i _sum10_02 = _mm256_setzero_si256(); -#else __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum01_13 = _mm256_setzero_si256(); - __m256i _sum11_03 = _mm256_setzero_si256(); -#endif -#else -#if __XOP__ - __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum11 = _mm_setzero_si128(); #else __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum02 = _mm_setzero_si128(); - __m128i _sum03 = _mm_setzero_si128(); __m128i _sum11 = _mm_setzero_si128(); - __m128i _sum12 = _mm_setzero_si128(); - __m128i _sum13 = _mm_setzero_si128(); -#endif #endif int j = 0; @@ -571,15 +496,8 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const _sum00_12 = _mm256_dpwssd_epi32(_sum00_12, _val01_16, _w01_16); _sum10_02 = _mm256_dpwssd_epi32(_sum10_02, _val10_16, _w01_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i 
_sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - - _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum01_13 = _mm256_add_epi32(_sum01_13, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum11_03 = _mm256_add_epi32(_sum11_03, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); + _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16)); #endif #else __m128i _val01 = _mm_loadl_epi64((const __m128i*)tmpptr); @@ -604,23 +522,10 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); #else - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl01 = _mm_mullo_epi16(_val0, _w1); - __m128i _sh01 = _mm_mulhi_epi16(_val0, _w1); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl11 = _mm_mullo_epi16(_val1, _w1); - __m128i _sh11 = _mm_mulhi_epi16(_val1, _w1); - - _sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpackhi_epi16(_sl11, _sh11)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); #endif #endif @@ -629,67 +534,26 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const } #if __AVX2__ -#if __AVXVNNI__ || __AVX512VNNI__ _sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02); _sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0)); #else - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_12, _sum10_02); - _tmp1 = _mm256_unpacklo_epi32(_sum01_13, _sum11_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_12, _sum10_02); - _tmp3 = _mm256_unpackhi_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_02 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum01_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum11_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum10_02); - _sum01_13 = _mm256_add_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum01_13); - - __m256i _perm_mask = _mm256_set_epi32(6, 4, 3, 1, 7, 5, 2, 0); - _sum00_12 = _mm256_permutevar8x32_epi32(_sum00_12, _perm_mask); -#endif -#else -#if __XOP__ +#if __SSSE3__ _sum00 = _mm_hadd_epi32(_sum00, _sum01); _sum10 = _mm_hadd_epi32(_sum10, _sum11); #else - // transpose 4x4 - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); - _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); - _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); - _tmp3 = 
_mm_unpackhi_epi32(_sum02, _sum03); - _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); - _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); - _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); - _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); - _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } + __m128i _sum00_sh = _mm_shuffle_epi32(_sum00, 216); + __m128i _sum01_sh = _mm_shuffle_epi32(_sum01, 216); + __m128i _sum10_sh = _mm_shuffle_epi32(_sum10, 216); + __m128i _sum11_sh = _mm_shuffle_epi32(_sum11, 216); + + _sum00 = _mm_unpacklo_epi64(_sum00_sh, _sum01_sh); + _sum01 = _mm_unpackhi_epi64(_sum00_sh, _sum01_sh); + _sum10 = _mm_unpacklo_epi64(_sum10_sh, _sum11_sh); + _sum11 = _mm_unpackhi_epi64(_sum10_sh, _sum11_sh); _sum00 = _mm_add_epi32(_sum00, _sum01); - _sum02 = _mm_add_epi32(_sum02, _sum03); _sum10 = _mm_add_epi32(_sum10, _sum11); - _sum12 = _mm_add_epi32(_sum12, _sum13); - - _sum00 = _mm_add_epi32(_sum00, _sum02); - _sum10 = _mm_add_epi32(_sum10, _sum12); #endif #endif } diff --git a/src/layer/x86/convolution_sgemm_pack16.h b/src/layer/x86/convolution_sgemm_pack16.h index 07ea17ce26cf..2612c4005086 100644 --- a/src/layer/x86/convolution_sgemm_pack16.h +++ b/src/layer/x86/convolution_sgemm_pack16.h @@ -67,57 +67,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob, __m512 _ra = _mm512_loadu_ps(img0 + 16 * 10); __m512 _rb = _mm512_loadu_ps(img0 + 16 * 11); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9); - __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9); - __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb); - __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb); - - __m512 _tmpc = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpg = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmph = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpi = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpj = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpk = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpl = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpm = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpn = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmpm, _tmpf, 
_MM_SHUFFLE(2, 0, 2, 0)); - _tmp5 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp6 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp8 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp9 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(3, 1, 3, 1)); - _tmpa = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - _tmpb = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0)); - _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0)); - _r6 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r8 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r9 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - _ra = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1)); - _rb = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -164,41 +114,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob, __m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6); __m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, 
_tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -237,25 +153,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob, __m512 _r2 = _mm512_loadu_ps(img0 + 16 * 2); __m512 _r3 = _mm512_loadu_ps(img0 + 16 * 3); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_r0, _r1, _r2, _r3); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -288,14 +186,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob, __m512 _r0 = _mm512_loadu_ps(img0); __m512 _r1 = _mm512_loadu_ps(img0 + 16); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - - __m512 _tmp2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - __m512 _tmp3 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x2_ps(_r0, _r1); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack16to1.h b/src/layer/x86/convolution_sgemm_pack16to1.h index 7b26ef27a404..a8a823a34b3c 100644 --- a/src/layer/x86/convolution_sgemm_pack16to1.h +++ b/src/layer/x86/convolution_sgemm_pack16to1.h @@ -66,7 +66,7 @@ static void im2col_sgemm_pack16to1_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _re = _mm512_loadu_ps(img0 + 16 * 14); __m512 _rf = _mm512_loadu_ps(img0 + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -117,41 +117,7 @@ static void im2col_sgemm_pack16to1_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6); __m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, 
_r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack16to4.h b/src/layer/x86/convolution_sgemm_pack16to4.h index 1930128c3d2b..e35cc149244a 100644 --- a/src/layer/x86/convolution_sgemm_pack16to4.h +++ b/src/layer/x86/convolution_sgemm_pack16to4.h @@ -59,41 +59,7 @@ static void im2col_sgemm_pack16to4_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6); __m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, 
_MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -132,25 +98,7 @@ static void im2col_sgemm_pack16to4_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r2 = _mm512_loadu_ps(img0 + 16 * 2); __m512 _r3 = _mm512_loadu_ps(img0 + 16 * 3); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_r0, _r1, _r2, _r3); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack16to8.h b/src/layer/x86/convolution_sgemm_pack16to8.h index ec293efb9f69..397fa0296ba4 100644 --- a/src/layer/x86/convolution_sgemm_pack16to8.h +++ b/src/layer/x86/convolution_sgemm_pack16to8.h @@ -59,41 +59,7 @@ static void im2col_sgemm_pack16to8_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6); __m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = 
_mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -132,25 +98,7 @@ static void im2col_sgemm_pack16to8_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r2 = _mm512_loadu_ps(img0 + 16 * 2); __m512 _r3 = _mm512_loadu_ps(img0 + 16 * 3); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_r0, _r1, _r2, _r3); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack1to4_int8.h b/src/layer/x86/convolution_sgemm_pack1to4_int8.h index ba567ce3354a..fd084987277f 100644 --- a/src/layer/x86/convolution_sgemm_pack1to4_int8.h +++ b/src/layer/x86/convolution_sgemm_pack1to4_int8.h @@ -301,17 +301,8 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl if (nn4 > 0) { -#if __AVXVNNI__ || __AVX512VNNI__ - __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum30_22 = _mm256_setzero_si256(); -#else __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum01_13 = _mm256_setzero_si256(); - __m256i _sum11_03 = _mm256_setzero_si256(); __m256i _sum30_22 = 
_mm256_setzero_si256(); - __m256i _sum21_33 = _mm256_setzero_si256(); - __m256i _sum31_23 = _mm256_setzero_si256(); -#endif int j = 0; for (; j < nn4; j++) @@ -334,72 +325,21 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum20_32 = _mm256_dpwssd_epi32(_sum20_32, _val23_16, _w01_16); _sum30_22 = _mm256_dpwssd_epi32(_sum30_22, _val32_16, _w01_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl20_31 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh20_31 = _mm256_mulhi_epi16(_val23_16, _w01_16); - __m256i _sl30_21 = _mm256_mullo_epi16(_val32_16, _w01_16); - __m256i _sh30_21 = _mm256_mulhi_epi16(_val32_16, _w01_16); - - _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum01_13 = _mm256_add_epi32(_sum01_13, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum11_03 = _mm256_add_epi32(_sum11_03, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum20_32 = _mm256_add_epi32(_sum20_32, _mm256_unpacklo_epi16(_sl20_31, _sh20_31)); - _sum30_22 = _mm256_add_epi32(_sum30_22, _mm256_unpacklo_epi16(_sl30_21, _sh30_21)); - _sum21_33 = _mm256_add_epi32(_sum21_33, _mm256_unpackhi_epi16(_sl20_31, _sh20_31)); - _sum31_23 = _mm256_add_epi32(_sum31_23, _mm256_unpackhi_epi16(_sl30_21, _sh30_21)); + _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum20_32 = _mm256_add_epi32(_sum20_32, _mm256_madd_epi16(_val23_16, _w01_16)); + _sum30_22 = _mm256_add_epi32(_sum30_22, _mm256_madd_epi16(_val32_16, _w01_16)); #endif tmpptr += 16; kptr0 += 16; } -#if __AVXVNNI__ || __AVX512VNNI__ _sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02); _sum20_32 = _mm256_hadd_epi32(_sum20_32, _sum30_22); _sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0)); _sum20_32 = _mm256_permute4x64_epi64(_sum20_32, _MM_SHUFFLE(2, 1, 3, 0)); -#else - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_12, _sum10_02); - _tmp1 = _mm256_unpacklo_epi32(_sum01_13, _sum11_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_12, _sum10_02); - _tmp3 = _mm256_unpackhi_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_02 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum01_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum11_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum20_32, _sum30_22); - _tmp1 = _mm256_unpacklo_epi32(_sum21_33, _sum31_23); - _tmp2 = _mm256_unpackhi_epi32(_sum20_32, _sum30_22); - _tmp3 = _mm256_unpackhi_epi32(_sum21_33, _sum31_23); - _sum20_32 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum30_22 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum21_33 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum31_23 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum10_02); - _sum01_13 = _mm256_add_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum01_13); - - _sum20_32 = _mm256_add_epi32(_sum20_32, _sum30_22); - _sum21_33 = _mm256_add_epi32(_sum21_33, _sum31_23); - _sum20_32 = _mm256_add_epi32(_sum20_32, _sum21_33); - - __m256i _perm_mask = 
_mm256_set_epi32(6, 4, 3, 1, 7, 5, 2, 0); - _sum00_12 = _mm256_permutevar8x32_epi32(_sum00_12, _perm_mask); - _sum20_32 = _mm256_permutevar8x32_epi32(_sum20_32, _perm_mask); -#endif } __m128i _sum00 = _mm256_extracti128_si256(_sum00_12, 0); @@ -458,25 +398,10 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl if (nn4 > 0) { #if __AVX2__ -#if __AVXVNNI__ || __AVX512VNNI__ - __m256i _sum10_02 = _mm256_setzero_si256(); -#else __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum01_13 = _mm256_setzero_si256(); - __m256i _sum11_03 = _mm256_setzero_si256(); -#endif -#else -#if __XOP__ - __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum11 = _mm_setzero_si128(); #else __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum02 = _mm_setzero_si128(); - __m128i _sum03 = _mm_setzero_si128(); __m128i _sum11 = _mm_setzero_si128(); - __m128i _sum12 = _mm_setzero_si128(); - __m128i _sum13 = _mm_setzero_si128(); -#endif #endif int j = 0; @@ -497,15 +422,8 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum00_12 = _mm256_dpwssd_epi32(_sum00_12, _val01_16, _w01_16); _sum10_02 = _mm256_dpwssd_epi32(_sum10_02, _val10_16, _w01_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - - _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum01_13 = _mm256_add_epi32(_sum01_13, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum11_03 = _mm256_add_epi32(_sum11_03, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); + _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16)); #endif #else __m128i _val01 = _mm_loadl_epi64((const __m128i*)tmpptr); @@ -530,23 +448,10 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); #else - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl01 = _mm_mullo_epi16(_val0, _w1); - __m128i _sh01 = _mm_mulhi_epi16(_val0, _w1); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl11 = _mm_mullo_epi16(_val1, _w1); - __m128i _sh11 = _mm_mulhi_epi16(_val1, _w1); - - _sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpackhi_epi16(_sl11, _sh11)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); #endif #endif @@ -555,67 +460,26 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl } #if __AVX2__ -#if __AVXVNNI__ 
|| __AVX512VNNI__ _sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02); _sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0)); #else - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_12, _sum10_02); - _tmp1 = _mm256_unpacklo_epi32(_sum01_13, _sum11_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_12, _sum10_02); - _tmp3 = _mm256_unpackhi_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_02 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum01_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum11_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum10_02); - _sum01_13 = _mm256_add_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum01_13); - - __m256i _perm_mask = _mm256_set_epi32(6, 4, 3, 1, 7, 5, 2, 0); - _sum00_12 = _mm256_permutevar8x32_epi32(_sum00_12, _perm_mask); -#endif -#else -#if __XOP__ +#if __SSSE3__ _sum00 = _mm_hadd_epi32(_sum00, _sum01); _sum10 = _mm_hadd_epi32(_sum10, _sum11); #else - // transpose 4x4 - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); - _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); - _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); - _tmp3 = _mm_unpackhi_epi32(_sum02, _sum03); - _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); - _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); - _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); - _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); - _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } + __m128i _sum00_sh = _mm_shuffle_epi32(_sum00, 216); + __m128i _sum01_sh = _mm_shuffle_epi32(_sum01, 216); + __m128i _sum10_sh = _mm_shuffle_epi32(_sum10, 216); + __m128i _sum11_sh = _mm_shuffle_epi32(_sum11, 216); + + _sum00 = _mm_unpacklo_epi64(_sum00_sh, _sum01_sh); + _sum01 = _mm_unpackhi_epi64(_sum00_sh, _sum01_sh); + _sum10 = _mm_unpacklo_epi64(_sum10_sh, _sum11_sh); + _sum11 = _mm_unpackhi_epi64(_sum10_sh, _sum11_sh); _sum00 = _mm_add_epi32(_sum00, _sum01); - _sum02 = _mm_add_epi32(_sum02, _sum03); _sum10 = _mm_add_epi32(_sum10, _sum11); - _sum12 = _mm_add_epi32(_sum12, _sum13); - - _sum00 = _mm_add_epi32(_sum00, _sum02); - _sum10 = _mm_add_epi32(_sum10, _sum12); #endif #endif } diff --git a/src/layer/x86/convolution_sgemm_pack8.h b/src/layer/x86/convolution_sgemm_pack8.h index af5b9cddf2ad..64f0dc2fa4cd 100644 --- a/src/layer/x86/convolution_sgemm_pack8.h +++ b/src/layer/x86/convolution_sgemm_pack8.h @@ -67,42 +67,7 @@ static void im2col_sgemm_pack8_avx(const Mat& bottom_im2col, Mat& top_blob, cons __m256 _ra = _mm256_load_ps(img0 + 8 * 10); __m256 _rb = _mm256_load_ps(img0 + 8 * 11); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); - __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); - __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb); - __m256 _tmpb = 
_mm256_unpackhi_ps(_ra, _rb); - __m256 _tmpc = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpg = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmph = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpi = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpj = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpk = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpl = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpm = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpn = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r5 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); - _r6 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r8 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1)); - _r9 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 3, 0, 1)); - _ra = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); - _rb = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -149,30 +114,7 @@ static void im2col_sgemm_pack8_avx(const Mat& bottom_im2col, Mat& top_blob, cons __m256 _r6 = _mm256_load_ps(img0 + 8 * 6); __m256 _r7 = _mm256_load_ps(img0 + 8 * 7); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - _r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2f128_ps(_tmpa, 
_tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -211,18 +153,7 @@ static void im2col_sgemm_pack8_avx(const Mat& bottom_im2col, Mat& top_blob, cons __m256 _r2 = _mm256_load_ps(img0 + 8 * 2); __m256 _r3 = _mm256_load_ps(img0 + 8 * 3); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x4_ps(_r0, _r1, _r2, _r3); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -255,10 +186,7 @@ static void im2col_sgemm_pack8_avx(const Mat& bottom_im2col, Mat& top_blob, cons __m256 _r0 = _mm256_load_ps(img0); __m256 _r1 = _mm256_load_ps(img0 + 8); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - _r0 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x2_ps(_r0, _r1); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack8to1.h b/src/layer/x86/convolution_sgemm_pack8to1.h index e23e13363f1c..c554599b6125 100644 --- a/src/layer/x86/convolution_sgemm_pack8to1.h +++ b/src/layer/x86/convolution_sgemm_pack8to1.h @@ -56,30 +56,7 @@ static void im2col_sgemm_pack8to1_avx(const Mat& bottom_im2col, Mat& top_blob, c __m256 _r6 = _mm256_load_ps(img0 + 8 * 6); __m256 _r7 = _mm256_load_ps(img0 + 8 * 7); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - 
_r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack8to16.h b/src/layer/x86/convolution_sgemm_pack8to16.h index 03e9633142f4..9e1fbe8f348f 100644 --- a/src/layer/x86/convolution_sgemm_pack8to16.h +++ b/src/layer/x86/convolution_sgemm_pack8to16.h @@ -57,7 +57,7 @@ static void im2col_sgemm_pack8to16_avx512(const Mat& bottom_im2col, Mat& top_blo __m256 _r6 = _mm256_load_ps(img0 + 8 * 6); __m256 _r7 = _mm256_load_ps(img0 + 8 * 7); - transpose8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack8to1_int8.h b/src/layer/x86/convolution_sgemm_pack8to1_int8.h index 9c080ffd5b9d..b76b6e26f182 100644 --- a/src/layer/x86/convolution_sgemm_pack8to1_int8.h +++ b/src/layer/x86/convolution_sgemm_pack8to1_int8.h @@ -225,23 +225,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16); _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl02_13 = _mm256_mullo_epi16(_val01_16, _w23_16); - __m256i _sh02_13 = _mm256_mulhi_epi16(_val01_16, _w23_16); - __m256i _sl12_03 = _mm256_mullo_epi16(_val10_16, _w23_16); - __m256i _sh12_03 = _mm256_mulhi_epi16(_val10_16, _w23_16); - - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpacklo_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpacklo_epi16(_sl12_03, _sh12_03)); - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpackhi_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpackhi_epi16(_sl12_03, _sh12_03)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16)); #endif __m128i _val23 = _mm_loadu_si128((const __m128i*)(tmpptr + 16)); @@ -254,23 +241,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23_16, _w23_16); _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32_16, _w23_16); #else - __m256i _sl04_15 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh04_15 = _mm256_mulhi_epi16(_val23_16, _w01_16); - __m256i _sl14_05 = _mm256_mullo_epi16(_val32_16, _w01_16); - __m256i _sh14_05 = _mm256_mulhi_epi16(_val32_16, _w01_16); - __m256i _sl06_17 = _mm256_mullo_epi16(_val23_16, 
_w23_16); - __m256i _sh06_17 = _mm256_mulhi_epi16(_val23_16, _w23_16); - __m256i _sl16_07 = _mm256_mullo_epi16(_val32_16, _w23_16); - __m256i _sh16_07 = _mm256_mulhi_epi16(_val32_16, _w23_16); - - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_unpacklo_epi16(_sl04_15, _sh04_15)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_unpacklo_epi16(_sl14_05, _sh14_05)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_unpacklo_epi16(_sl06_17, _sh06_17)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_unpacklo_epi16(_sl16_07, _sh16_07)); - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_unpackhi_epi16(_sl04_15, _sh04_15)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_unpackhi_epi16(_sl14_05, _sh14_05)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_unpackhi_epi16(_sl06_17, _sh06_17)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_unpackhi_epi16(_sl16_07, _sh16_07)); + _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23_16, _w01_16)); + _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32_16, _w01_16)); + _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23_16, _w23_16)); + _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32_16, _w23_16)); #endif tmpptr += 32; @@ -386,23 +360,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16); _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl02_13 = _mm256_mullo_epi16(_val01_16, _w23_16); - __m256i _sh02_13 = _mm256_mulhi_epi16(_val01_16, _w23_16); - __m256i _sl12_03 = _mm256_mullo_epi16(_val10_16, _w23_16); - __m256i _sh12_03 = _mm256_mulhi_epi16(_val10_16, _w23_16); - - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpacklo_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpacklo_epi16(_sl12_03, _sh12_03)); - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpackhi_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpackhi_epi16(_sl12_03, _sh12_03)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16)); #endif #else __m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr); @@ -429,39 +390,14 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); #else - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl01 = _mm_mullo_epi16(_val0, _w1); - __m128i _sh01 = _mm_mulhi_epi16(_val0, _w1); - __m128i _sl02 = _mm_mullo_epi16(_val0, _w2); - __m128i _sh02 = _mm_mulhi_epi16(_val0, _w2); - __m128i _sl03 = 
_mm_mullo_epi16(_val0, _w3); - __m128i _sh03 = _mm_mulhi_epi16(_val0, _w3); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl11 = _mm_mullo_epi16(_val1, _w1); - __m128i _sh11 = _mm_mulhi_epi16(_val1, _w1); - __m128i _sl12 = _mm_mullo_epi16(_val1, _w2); - __m128i _sh12 = _mm_mulhi_epi16(_val1, _w2); - __m128i _sl13 = _mm_mullo_epi16(_val1, _w3); - __m128i _sh13 = _mm_mulhi_epi16(_val1, _w3); - - _sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpacklo_epi16(_sl02, _sh02)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpacklo_epi16(_sl03, _sh03)); - _sum00 = _mm_add_epi32(_sum00, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpackhi_epi16(_sl02, _sh02)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpackhi_epi16(_sl03, _sh03)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpacklo_epi16(_sl12, _sh12)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpacklo_epi16(_sl13, _sh13)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpackhi_epi16(_sl11, _sh11)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpackhi_epi16(_sl12, _sh12)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpackhi_epi16(_sl13, _sh13)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); + _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); + _sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); + _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); #endif #endif @@ -582,15 +518,8 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01_16); _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23_16); #else - __m256i _sl0_1 = _mm256_mullo_epi16(_valval, _w01_16); - __m256i _sh0_1 = _mm256_mulhi_epi16(_valval, _w01_16); - __m256i _sl2_3 = _mm256_mullo_epi16(_valval, _w23_16); - __m256i _sh2_3 = _mm256_mulhi_epi16(_valval, _w23_16); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl0_1, _sh0_1)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl2_3, _sh2_3)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl0_1, _sh0_1)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl2_3, _sh2_3)); + _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01_16)); + _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23_16)); #endif #else __m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr); @@ -615,23 +544,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); #else - __m128i _sl0 = _mm_mullo_epi16(_val, _w0); - __m128i _sh0 = _mm_mulhi_epi16(_val, _w0); - __m128i _sl1 = _mm_mullo_epi16(_val, _w1); - __m128i _sh1 = _mm_mulhi_epi16(_val, _w1); - __m128i _sl2 = _mm_mullo_epi16(_val, _w2); - __m128i _sh2 = _mm_mulhi_epi16(_val, _w2); - __m128i _sl3 = _mm_mullo_epi16(_val, _w3); - 
__m128i _sh3 = _mm_mulhi_epi16(_val, _w3); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl3, _sh3)); + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); + _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); + _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); #endif #endif @@ -694,10 +610,8 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl int nn = inch * maxk; // inch always > 0 - __m256i _sum0_2 = _mm256_setzero_si256(); - __m256i _sum1_3 = _mm256_setzero_si256(); - __m256i _sum4_6 = _mm256_setzero_si256(); - __m256i _sum5_7 = _mm256_setzero_si256(); + __m256i _sum01 = _mm256_setzero_si256(); + __m256i _sum23 = _mm256_setzero_si256(); int j = 0; for (; j < nn; j++) @@ -711,31 +625,27 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl __m256i _w01_16 = _mm256_cvtepi8_epi16(_w01); _w01_16 = _mm256_permute4x64_epi64(_w01_16, _MM_SHUFFLE(1, 0, 1, 0)); - __m256i _sl00_10 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_10 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl20_30 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh20_30 = _mm256_mulhi_epi16(_val23_16, _w01_16); - - _sum0_2 = _mm256_add_epi32(_sum0_2, _mm256_unpacklo_epi16(_sl00_10, _sh00_10)); - _sum1_3 = _mm256_add_epi32(_sum1_3, _mm256_unpackhi_epi16(_sl00_10, _sh00_10)); - _sum4_6 = _mm256_add_epi32(_sum4_6, _mm256_unpacklo_epi16(_sl20_30, _sh20_30)); - _sum5_7 = _mm256_add_epi32(_sum5_7, _mm256_unpackhi_epi16(_sl20_30, _sh20_30)); +#if __AVXVNNI__ || __AVX512VNNI__ + _sum01 = _mm256_dpwssd_epi32(_sum01, _val01_16, _w01_16); + _sum23 = _mm256_dpwssd_epi32(_sum23, _val23_16, _w01_16); +#else + _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum23 = _mm256_add_epi32(_sum23, _mm256_madd_epi16(_val23_16, _w01_16)); +#endif tmpptr += 32; kptr0 += 8; } - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); - _sum4_6 = _mm256_add_epi32(_sum4_6, _sum5_7); - __m128i _sum0 = _mm256_extracti128_si256(_sum0_2, 0); - __m128i _sum2 = _mm256_extracti128_si256(_sum0_2, 1); - __m128i _sum4 = _mm256_extracti128_si256(_sum4_6, 0); - __m128i _sum6 = _mm256_extracti128_si256(_sum4_6, 1); + __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); + __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); + __m128i _sum2 = _mm256_extracti128_si256(_sum23, 0); + __m128i _sum3 = _mm256_extracti128_si256(_sum23, 1); outptr0[0] = _mm_reduce_add_epi32(_sum0); - outptr0[1] = _mm_reduce_add_epi32(_sum2); - outptr0[2] = _mm_reduce_add_epi32(_sum4); - outptr0[3] = _mm_reduce_add_epi32(_sum6); + outptr0[1] = _mm_reduce_add_epi32(_sum1); + outptr0[2] = _mm_reduce_add_epi32(_sum2); + outptr0[3] = _mm_reduce_add_epi32(_sum3); outptr0 += 4; } #endif @@ -751,13 +661,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl int nn = inch * maxk; // inch always > 0 #if __AVX2__ - __m256i _sum0_2 = _mm256_setzero_si256(); - __m256i _sum1_3 = _mm256_setzero_si256(); + __m256i _sum01 = _mm256_setzero_si256(); #else 
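The int8 paths above now keep a single 32-bit accumulator per output and finish with a horizontal sum of its lanes through _mm_reduce_add_epi32, which appears to be ncnn's own helper (presumably from x86_usability.h) rather than a standard intrinsic. A standalone sketch of such a reduction, SSE2 only and with illustrative names, assuming nothing beyond what the stores into outptr0 imply:

#include <emmintrin.h>
#include <cstdio>

// Horizontally add the four signed 32-bit lanes of v.
static int reduce_add_epi32(__m128i v)
{
    __m128i hi64 = _mm_unpackhi_epi64(v, v);      // [c d c d]
    __m128i sum64 = _mm_add_epi32(v, hi64);       // [a+c b+d . .]
    __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(1, 1, 1, 1));
    __m128i sum32 = _mm_add_epi32(sum64, hi32);   // lane 0 = a+b+c+d
    return _mm_cvtsi128_si32(sum32);
}

int main()
{
    __m128i v = _mm_set_epi32(4, 3, 2, 1);
    printf("%d\n", reduce_add_epi32(v)); // prints 10
    return 0;
}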
__m128i _sum0 = _mm_setzero_si128(); __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); #endif int j = 0; @@ -771,11 +678,11 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl __m256i _w01_16 = _mm256_cvtepi8_epi16(_w01); _w01_16 = _mm256_permute4x64_epi64(_w01_16, _MM_SHUFFLE(1, 0, 1, 0)); - __m256i _sl00_10 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_10 = _mm256_mulhi_epi16(_val01_16, _w01_16); - - _sum0_2 = _mm256_add_epi32(_sum0_2, _mm256_unpacklo_epi16(_sl00_10, _sh00_10)); - _sum1_3 = _mm256_add_epi32(_sum1_3, _mm256_unpackhi_epi16(_sl00_10, _sh00_10)); +#if __AVXVNNI__ || __AVX512VNNI__ + _sum01 = _mm256_dpwssd_epi32(_sum01, _val01_16, _w01_16); +#else + _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01_16, _w01_16)); +#endif #else __m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr); __m128i _extval01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _val01); @@ -790,15 +697,13 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01); #endif - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl10, _sh10)); +#if __XOP__ + _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); + _sum1 = _mm_maddd_epi16(_val1, _w0, _sum1); +#else + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum1); +#endif #endif tmpptr += 16; @@ -806,16 +711,12 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl } #if __AVX2__ - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); - __m128i _sum0 = _mm256_extracti128_si256(_sum0_2, 0); - __m128i _sum2 = _mm256_extracti128_si256(_sum0_2, 1); -#else - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); + __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); + __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); #endif outptr0[0] = _mm_reduce_add_epi32(_sum0); - outptr0[1] = _mm_reduce_add_epi32(_sum2); + outptr0[1] = _mm_reduce_add_epi32(_sum1); outptr0 += 2; } for (; i < size; i++) @@ -830,7 +731,6 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl int nn = inch * maxk; // inch always > 0 __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); int j = 0; for (; j < nn; j++) @@ -851,18 +751,16 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01); #endif - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl00, _sh00)); +#if __XOP__ + _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); +#else + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); +#endif tmpptr += 8; kptr0 += 8; } - _sum0 = _mm_add_epi32(_sum0, _sum1); - outptr0[0] = _mm_reduce_add_epi32(_sum0); outptr0 += 1; } diff --git a/src/layer/x86/convolution_sgemm_pack8to4.h b/src/layer/x86/convolution_sgemm_pack8to4.h index 5fec80fea323..a98c816c5dec 100644 --- 
a/src/layer/x86/convolution_sgemm_pack8to4.h +++ b/src/layer/x86/convolution_sgemm_pack8to4.h @@ -59,30 +59,7 @@ static void im2col_sgemm_pack8to4_avx(const Mat& bottom_im2col, Mat& top_blob, c __m256 _r6 = _mm256_load_ps(img0 + 8 * 6); __m256 _r7 = _mm256_load_ps(img0 + 8 * 7); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - _r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -121,18 +98,7 @@ static void im2col_sgemm_pack8to4_avx(const Mat& bottom_im2col, Mat& top_blob, c __m256 _r2 = _mm256_load_ps(img0 + 8 * 2); __m256 _r3 = _mm256_load_ps(img0 + 8 * 3); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x4_ps(_r0, _r1, _r2, _r3); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack8to4_int8.h b/src/layer/x86/convolution_sgemm_pack8to4_int8.h index 5e650c654056..8fdaece96520 100644 --- a/src/layer/x86/convolution_sgemm_pack8to4_int8.h +++ b/src/layer/x86/convolution_sgemm_pack8to4_int8.h @@ -215,23 +215,10 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16); _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16); #else - __m256i _sl00_11 = 
_mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl02_13 = _mm256_mullo_epi16(_val01_16, _w23_16); - __m256i _sh02_13 = _mm256_mulhi_epi16(_val01_16, _w23_16); - __m256i _sl12_03 = _mm256_mullo_epi16(_val10_16, _w23_16); - __m256i _sh12_03 = _mm256_mulhi_epi16(_val10_16, _w23_16); - - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpacklo_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpacklo_epi16(_sl12_03, _sh12_03)); - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpackhi_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpackhi_epi16(_sl12_03, _sh12_03)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16)); #endif __m128i _val23 = _mm_loadu_si128((const __m128i*)(tmpptr + 16)); @@ -244,23 +231,10 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23_16, _w23_16); _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32_16, _w23_16); #else - __m256i _sl04_15 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh04_15 = _mm256_mulhi_epi16(_val23_16, _w01_16); - __m256i _sl14_05 = _mm256_mullo_epi16(_val32_16, _w01_16); - __m256i _sh14_05 = _mm256_mulhi_epi16(_val32_16, _w01_16); - __m256i _sl06_17 = _mm256_mullo_epi16(_val23_16, _w23_16); - __m256i _sh06_17 = _mm256_mulhi_epi16(_val23_16, _w23_16); - __m256i _sl16_07 = _mm256_mullo_epi16(_val32_16, _w23_16); - __m256i _sh16_07 = _mm256_mulhi_epi16(_val32_16, _w23_16); - - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_unpacklo_epi16(_sl04_15, _sh04_15)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_unpacklo_epi16(_sl14_05, _sh14_05)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_unpacklo_epi16(_sl06_17, _sh06_17)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_unpacklo_epi16(_sl16_07, _sh16_07)); - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_unpackhi_epi16(_sl04_15, _sh04_15)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_unpackhi_epi16(_sl14_05, _sh14_05)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_unpackhi_epi16(_sl06_17, _sh06_17)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_unpackhi_epi16(_sl16_07, _sh16_07)); + _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23_16, _w01_16)); + _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32_16, _w01_16)); + _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23_16, _w23_16)); + _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32_16, _w23_16)); #endif tmpptr += 32; @@ -355,23 +329,10 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16); _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16); 
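These hunks replace the hand-rolled widen-and-add pattern (_mm256_mullo_epi16/_mm256_mulhi_epi16 followed by unpacklo/unpackhi adds) with _mm256_madd_epi16, which multiplies adjacent signed 16-bit elements and sums each pair into a 32-bit lane in one instruction; on AVX-VNNI/AVX512-VNNI the _mm256_dpwssd_epi32 branch additionally fuses the accumulate. A minimal sketch of what the madd form computes, assuming only an AVX2 machine (compile with -mavx2); the data values are arbitrary:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    int16_t a[16], b[16];
    for (int i = 0; i < 16; i++)
    {
        a[i] = (int16_t)(i * 37 - 100);
        b[i] = (int16_t)(300 - i * 21);
    }

    __m256i _a = _mm256_loadu_si256((const __m256i*)a);
    __m256i _b = _mm256_loadu_si256((const __m256i*)b);

    // each 32-bit lane k = a[2k]*b[2k] + a[2k+1]*b[2k+1]
    __m256i _madd = _mm256_madd_epi16(_a, _b);

    int32_t out[8];
    _mm256_storeu_si256((__m256i*)out, _madd);

    for (int k = 0; k < 8; k++)
    {
        int32_t ref = (int32_t)a[2 * k] * b[2 * k] + (int32_t)a[2 * k + 1] * b[2 * k + 1];
        printf("lane %d: %d (ref %d)\n", k, (int)out[k], (int)ref);
    }
    return 0;
}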
#else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl02_13 = _mm256_mullo_epi16(_val01_16, _w23_16); - __m256i _sh02_13 = _mm256_mulhi_epi16(_val01_16, _w23_16); - __m256i _sl12_03 = _mm256_mullo_epi16(_val10_16, _w23_16); - __m256i _sh12_03 = _mm256_mulhi_epi16(_val10_16, _w23_16); - - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpacklo_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpacklo_epi16(_sl12_03, _sh12_03)); - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpackhi_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpackhi_epi16(_sl12_03, _sh12_03)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16)); #endif #else __m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr); @@ -398,39 +359,14 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); #else - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl01 = _mm_mullo_epi16(_val0, _w1); - __m128i _sh01 = _mm_mulhi_epi16(_val0, _w1); - __m128i _sl02 = _mm_mullo_epi16(_val0, _w2); - __m128i _sh02 = _mm_mulhi_epi16(_val0, _w2); - __m128i _sl03 = _mm_mullo_epi16(_val0, _w3); - __m128i _sh03 = _mm_mulhi_epi16(_val0, _w3); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl11 = _mm_mullo_epi16(_val1, _w1); - __m128i _sh11 = _mm_mulhi_epi16(_val1, _w1); - __m128i _sl12 = _mm_mullo_epi16(_val1, _w2); - __m128i _sh12 = _mm_mulhi_epi16(_val1, _w2); - __m128i _sl13 = _mm_mullo_epi16(_val1, _w3); - __m128i _sh13 = _mm_mulhi_epi16(_val1, _w3); - - _sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpacklo_epi16(_sl02, _sh02)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpacklo_epi16(_sl03, _sh03)); - _sum00 = _mm_add_epi32(_sum00, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpackhi_epi16(_sl02, _sh02)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpackhi_epi16(_sl03, _sh03)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpacklo_epi16(_sl12, _sh12)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpacklo_epi16(_sl13, _sh13)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpackhi_epi16(_sl11, _sh11)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpackhi_epi16(_sl12, 
_sh12)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpackhi_epi16(_sl13, _sh13)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); + _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); + _sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); + _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); #endif #endif @@ -537,15 +473,8 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01_16); _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23_16); #else - __m256i _sl0_1 = _mm256_mullo_epi16(_valval, _w01_16); - __m256i _sh0_1 = _mm256_mulhi_epi16(_valval, _w01_16); - __m256i _sl2_3 = _mm256_mullo_epi16(_valval, _w23_16); - __m256i _sh2_3 = _mm256_mulhi_epi16(_valval, _w23_16); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl0_1, _sh0_1)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl2_3, _sh2_3)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl0_1, _sh0_1)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl2_3, _sh2_3)); + _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01_16)); + _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23_16)); #endif #else __m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr); @@ -570,23 +499,10 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); #else - __m128i _sl0 = _mm_mullo_epi16(_val, _w0); - __m128i _sh0 = _mm_mulhi_epi16(_val, _w0); - __m128i _sl1 = _mm_mullo_epi16(_val, _w1); - __m128i _sh1 = _mm_mulhi_epi16(_val, _w1); - __m128i _sl2 = _mm_mullo_epi16(_val, _w2); - __m128i _sh2 = _mm_mulhi_epi16(_val, _w2); - __m128i _sl3 = _mm_mullo_epi16(_val, _w3); - __m128i _sh3 = _mm_mulhi_epi16(_val, _w3); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl3, _sh3)); + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); + _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); + _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); #endif #endif diff --git a/src/layer/x86/convolution_winograd_dot_pack16.h b/src/layer/x86/convolution_winograd_dot_pack16.h index acbd3191e431..fec99b1bb4c6 100644 --- a/src/layer/x86/convolution_winograd_dot_pack16.h +++ b/src/layer/x86/convolution_winograd_dot_pack16.h @@ -65,57 +65,7 @@ static void convolution_winograd_dot_pack16_avx512(Mat& bottom_blob_tm, int outc __m512 _ra = _mm512_load_ps(r0 + 16 * 10); __m512 _rb = _mm512_load_ps(r0 + 16 * 11); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = 
_mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9); - __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9); - __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb); - __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb); - - __m512 _tmpc = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpg = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmph = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpi = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpj = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpk = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpl = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpm = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpn = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp5 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp6 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp8 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp9 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(3, 1, 3, 1)); - _tmpa = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - _tmpb = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0)); - _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0)); - _r6 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r8 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r9 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - _ra = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1)); - _rb = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); @@ -154,41 +104,7 @@ static void convolution_winograd_dot_pack16_avx512(Mat& bottom_blob_tm, int outc __m512 _r6 = _mm512_load_ps(r0 + 16 * 6); __m512 _r7 = _mm512_load_ps(r0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, 
_r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); @@ -219,25 +135,7 @@ static void convolution_winograd_dot_pack16_avx512(Mat& bottom_blob_tm, int outc __m512 _r2 = _mm512_load_ps(r0 + 16 * 2); __m512 _r3 = _mm512_load_ps(r0 + 16 * 3); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_r0, _r1, _r2, _r3); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); @@ -262,14 +160,7 @@ static void convolution_winograd_dot_pack16_avx512(Mat& bottom_blob_tm, int outc __m512 _r0 = _mm512_load_ps(r0); __m512 _r1 = _mm512_load_ps(r0 + 16); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = 
_mm512_unpackhi_ps(_r0, _r1); - - __m512 _tmp2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - __m512 _tmp3 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x2_ps(_r0, _r1); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_winograd_dot_pack8.h b/src/layer/x86/convolution_winograd_dot_pack8.h index eaa56e30adef..2855ca23cab9 100644 --- a/src/layer/x86/convolution_winograd_dot_pack8.h +++ b/src/layer/x86/convolution_winograd_dot_pack8.h @@ -65,42 +65,7 @@ static void convolution_winograd_dot_pack8_avx(Mat& bottom_blob_tm, int outch, c __m256 _ra = _mm256_load_ps(r0 + 8 * 10); __m256 _rb = _mm256_load_ps(r0 + 8 * 11); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); - __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); - __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb); - __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb); - __m256 _tmpc = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpg = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmph = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpi = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpj = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpk = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpl = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpm = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpn = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r5 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); - _r6 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r8 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1)); - _r9 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 3, 0, 1)); - _ra = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); - _rb = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -139,30 +104,7 @@ static void convolution_winograd_dot_pack8_avx(Mat& bottom_blob_tm, int outch, c __m256 _r6 = _mm256_load_ps(r0 + 8 * 6); __m256 _r7 = _mm256_load_ps(r0 + 8 * 7); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - 
__m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - _r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -193,18 +135,7 @@ static void convolution_winograd_dot_pack8_avx(Mat& bottom_blob_tm, int outch, c __m256 _r2 = _mm256_load_ps(r0 + 8 * 2); __m256 _r3 = _mm256_load_ps(r0 + 8 * 3); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x4_ps(_r0, _r1, _r2, _r3); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -229,10 +160,7 @@ static void convolution_winograd_dot_pack8_avx(Mat& bottom_blob_tm, int outch, c __m256 _r0 = _mm256_load_ps(r0); __m256 _r1 = _mm256_load_ps(r0 + 8); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - _r0 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x2_ps(_r0, _r1); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 60522b04eaa4..9acceb28854e 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -16,12 +16,15 @@ #if __SSE2__ #include +#if __SSSE3__ +#include #if __SSE4_1__ #include #if __AVX__ #include #endif #endif // __SSE4_1__ +#endif // __SSSE3__ #endif // 
__SSE2__ #include "x86_activation.h" #include "x86_usability.h" diff --git a/src/layer/x86/deformableconv2d_pack16.h b/src/layer/x86/deformableconv2d_pack16.h new file mode 100644 index 000000000000..42f260f6e96c --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16.h @@ -0,0 +1,435 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * 
mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + __m512 _val_channel4 = _val_channel0; + __m512 _val_channel5 = _val_channel0; + __m512 _val_channel6 = _val_channel0; + __m512 _val_channel7 = _val_channel0; + __m512 _val_channel8 = _val_channel0; + __m512 _val_channel9 = _val_channel0; + __m512 _val_channela = _val_channel0; + __m512 _val_channelb = _val_channel0; + __m512 _val_channelc = _val_channel0; + __m512 _val_channeld = _val_channel0; + __m512 _val_channele = _val_channel0; + __m512 _val_channelf = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v1_channel4 = _val_channel0; + __m512 _v1_channel5 = _val_channel0; + __m512 _v1_channel6 = _val_channel0; + __m512 _v1_channel7 = _val_channel0; + __m512 _v1_channel8 = _val_channel0; + __m512 _v1_channel9 = _val_channel0; + __m512 _v1_channela = _val_channel0; + __m512 _v1_channelb = _val_channel0; + __m512 _v1_channelc = _val_channel0; + __m512 _v1_channeld = _val_channel0; + __m512 _v1_channele = _val_channel0; + __m512 _v1_channelf = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v2_channel4 = _val_channel0; + __m512 _v2_channel5 = _val_channel0; + __m512 _v2_channel6 = _val_channel0; + __m512 _v2_channel7 = _val_channel0; + __m512 
_v2_channel8 = _val_channel0; + __m512 _v2_channel9 = _val_channel0; + __m512 _v2_channela = _val_channel0; + __m512 _v2_channelb = _val_channel0; + __m512 _v2_channelc = _val_channel0; + __m512 _v2_channeld = _val_channel0; + __m512 _v2_channele = _val_channel0; + __m512 _v2_channelf = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v3_channel4 = _val_channel0; + __m512 _v3_channel5 = _val_channel0; + __m512 _v3_channel6 = _val_channel0; + __m512 _v3_channel7 = _val_channel0; + __m512 _v3_channel8 = _val_channel0; + __m512 _v3_channel9 = _val_channel0; + __m512 _v3_channela = _val_channel0; + __m512 _v3_channelb = _val_channel0; + __m512 _v3_channelc = _val_channel0; + __m512 _v3_channeld = _val_channel0; + __m512 _v3_channele = _val_channel0; + __m512 _v3_channelf = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + __m512 _v4_channel4 = _val_channel0; + __m512 _v4_channel5 = _val_channel0; + __m512 _v4_channel6 = _val_channel0; + __m512 _v4_channel7 = _val_channel0; + __m512 _v4_channel8 = _val_channel0; + __m512 _v4_channel9 = _val_channel0; + __m512 _v4_channela = _val_channel0; + __m512 _v4_channelb = _val_channel0; + __m512 _v4_channelc = _val_channel0; + __m512 _v4_channeld = _val_channel0; + __m512 _v4_channele = _val_channel0; + __m512 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + _v1_channel4 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 4]); + _v1_channel5 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 5]); + _v1_channel6 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 6]); + _v1_channel7 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 7]); + _v1_channel8 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 8]); + _v1_channel9 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 9]); + _v1_channela = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 10]); + _v1_channelb = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 11]); + _v1_channelc = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 12]); + _v1_channeld = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 13]); + _v1_channele = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 14]); + _v1_channelf = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 15]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + _v2_channel4 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 4]); + _v2_channel5 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 5]); + _v2_channel6 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 6]); + _v2_channel7 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 7]); + _v2_channel8 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 8]); + _v2_channel9 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 9]); + _v2_channela = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 10]); + _v2_channelb = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 11]); + _v2_channelc = 
_mm512_set1_ps(data_im_ptr[v2_pos * elempack + 12]); + _v2_channeld = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 13]); + _v2_channele = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 14]); + _v2_channelf = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 15]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + _v3_channel4 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 4]); + _v3_channel5 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 5]); + _v3_channel6 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 6]); + _v3_channel7 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 7]); + _v3_channel8 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 8]); + _v3_channel9 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 9]); + _v3_channela = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 10]); + _v3_channelb = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 11]); + _v3_channelc = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 12]); + _v3_channeld = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 13]); + _v3_channele = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 14]); + _v3_channelf = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 15]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + _v4_channel4 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 4]); + _v4_channel5 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 5]); + _v4_channel6 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 6]); + _v4_channel7 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 7]); + _v4_channel8 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 8]); + _v4_channel9 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 9]); + _v4_channela = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 10]); + _v4_channelb = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 11]); + _v4_channelc = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 12]); + _v4_channeld = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 13]); + _v4_channele = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 14]); + _v4_channelf = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 15]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = 
_mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm512_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm512_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm512_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm512_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm512_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm512_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v4_channel9, _w4, _val_channel9); + _val_channela = _mm512_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm512_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm512_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm512_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm512_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm512_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm512_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm512_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm512_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm512_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm512_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm512_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v4_channelf, _w4, 
_val_channelf); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm512_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm512_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm512_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm512_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm512_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm512_mul_ps(_val_channel9, _mask); + _val_channela = _mm512_mul_ps(_val_channela, _mask); + _val_channelb = _mm512_mul_ps(_val_channelb, _mask); + _val_channelc = _mm512_mul_ps(_val_channelc, _mask); + _val_channeld = _mm512_mul_ps(_val_channeld, _mask); + _val_channele = _mm512_mul_ps(_val_channele, _mask); + _val_channelf = _mm512_mul_ps(_val_channelf, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack + __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m512 _conv_w6 = _mm512_load_ps(kptr + 96); // 6 * out_elempack + __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum); + __m512 _conv_w8 = _mm512_load_ps(kptr + 128); // 8 * out_elempack + __m512 _conv_w9 = _mm512_load_ps(kptr + 144); // 9 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm512_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m512 _conv_wa = _mm512_load_ps(kptr + 160); // 10 * out_elempack + __m512 _conv_wb = _mm512_load_ps(kptr + 176); // 11 * out_elempack + _sum = _mm512_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm512_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m512 _conv_wc = _mm512_load_ps(kptr + 192); // 12 * out_elempack + __m512 _conv_wd = _mm512_load_ps(kptr + 208); // 13 * out_elempack + _sum = _mm512_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm512_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m512 _conv_we = _mm512_load_ps(kptr + 224); // 14 * out_elempack + __m512 _conv_wf = _mm512_load_ps(kptr + 240); // 15 * out_elempack + _sum = _mm512_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm512_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to1.h b/src/layer/x86/deformableconv2d_pack16to1.h new file mode 100644 index 000000000000..c721f5c52334 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to1.h @@ -0,0 +1,370 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
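The new deformableconv2d kernels sample each displaced input position bilinearly: the four corner weights w1..w4 are built from the fractional parts (hh, hw, lh, lw), and any corner that falls outside the feature map contributes zero. A scalar sketch of the same sampling rule for a single-channel h x w map, with variable names mirroring the packed code; the function itself is illustrative, not part of ncnn:

#include <cmath>
#include <cstdio>

static float bilinear_sample(const float* data, int h, int w, float h_im, float w_im)
{
    // same validity test as the kernels: fully outside -> zero
    if (!(h_im > -1 && w_im > -1 && h_im < h && w_im < w))
        return 0.f;

    int h_low = (int)std::floor(h_im);
    int w_low = (int)std::floor(w_im);
    int h_high = h_low + 1;
    int w_high = w_low + 1;

    float lh = h_im - h_low, lw = w_im - w_low;
    float hh = 1 - lh, hw = 1 - lw;

    // corner values, zero when a corner lies outside the map
    float v1 = (h_low >= 0 && w_low >= 0) ? data[h_low * w + w_low] : 0.f;
    float v2 = (h_low >= 0 && w_high <= w - 1) ? data[h_low * w + w_high] : 0.f;
    float v3 = (h_high <= h - 1 && w_low >= 0) ? data[h_high * w + w_low] : 0.f;
    float v4 = (h_high <= h - 1 && w_high <= w - 1) ? data[h_high * w + w_high] : 0.f;

    // w1..w4 match the weights computed in the packed kernels
    return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
}

int main()
{
    const float img[4] = {1.f, 2.f, 3.f, 4.f}; // 2x2 map
    printf("%f\n", bilinear_sample(img, 2, 2, 0.5f, 0.5f)); // prints 2.5
    return 0;
}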
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to1_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + 
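+ // lh/lw are the fractional offsets of the sampling point below/right of the top-left integer sample; their complements hh/hw (computed next) combine into the four bilinear corner weights w1..w4 used for every input channel of this tap.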
float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + float _val_channel4 = _val_channel0; + float _val_channel5 = _val_channel0; + float _val_channel6 = _val_channel0; + float _val_channel7 = _val_channel0; + float _val_channel8 = _val_channel0; + float _val_channel9 = _val_channel0; + float _val_channela = _val_channel0; + float _val_channelb = _val_channel0; + float _val_channelc = _val_channel0; + float _val_channeld = _val_channel0; + float _val_channele = _val_channel0; + float _val_channelf = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v1_channel4 = _val_channel0; + float _v1_channel5 = _val_channel0; + float _v1_channel6 = _val_channel0; + float _v1_channel7 = _val_channel0; + float _v1_channel8 = _val_channel0; + float _v1_channel9 = _val_channel0; + float _v1_channela = _val_channel0; + float _v1_channelb = _val_channel0; + float _v1_channelc = _val_channel0; + float _v1_channeld = _val_channel0; + float _v1_channele = _val_channel0; + float _v1_channelf = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v2_channel4 = _val_channel0; + float _v2_channel5 = _val_channel0; + float _v2_channel6 = _val_channel0; + float _v2_channel7 = _val_channel0; + float _v2_channel8 = _val_channel0; + float _v2_channel9 = _val_channel0; + float _v2_channela = _val_channel0; + float _v2_channelb = _val_channel0; + float _v2_channelc = _val_channel0; + float _v2_channeld = _val_channel0; + float _v2_channele = _val_channel0; + float _v2_channelf = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float _v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v3_channel4 = _val_channel0; + float _v3_channel5 = _val_channel0; + float _v3_channel6 = _val_channel0; + float _v3_channel7 = _val_channel0; + float _v3_channel8 = _val_channel0; + float _v3_channel9 = _val_channel0; + float _v3_channela = _val_channel0; + float _v3_channelb = _val_channel0; + float _v3_channelc = _val_channel0; + float _v3_channeld = _val_channel0; + float _v3_channele = _val_channel0; + float _v3_channelf = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + float _v4_channel4 = _val_channel0; + float _v4_channel5 = _val_channel0; + float _v4_channel6 = _val_channel0; + float _v4_channel7 = _val_channel0; + float _v4_channel8 = _val_channel0; + float _v4_channel9 = _val_channel0; + float _v4_channela = _val_channel0; + float _v4_channelb = _val_channel0; + float _v4_channelc = _val_channel0; + float _v4_channeld = 
_val_channel0; + float _v4_channele = _val_channel0; + float _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = *(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = *(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = *(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = *(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = *(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = *(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = *(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = *(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = *(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = *(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = *(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = *(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = *(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = *(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = *(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = *(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = *(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = *(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = *(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = *(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = *(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = *(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = *(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = *(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = *(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = *(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = *(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = *(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = *(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = *(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = *(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = *(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = *(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = *(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = *(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = *(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = *(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = *(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = *(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = *(data_im_ptr + v4_pos * elempack + 7); + _v4_channel8 = *(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = *(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = *(data_im_ptr + v4_pos * elempack + 10); + 
_v4_channelb = *(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = *(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = *(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = *(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = *(data_im_ptr + v4_pos * elempack + 15); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + _val_channel4 = w1 * _v1_channel4 + w2 * _v2_channel4 + w3 * _v3_channel4 + w4 * _v4_channel4; + _val_channel5 = w1 * _v1_channel5 + w2 * _v2_channel5 + w3 * _v3_channel5 + w4 * _v4_channel5; + _val_channel6 = w1 * _v1_channel6 + w2 * _v2_channel6 + w3 * _v3_channel6 + w4 * _v4_channel6; + _val_channel7 = w1 * _v1_channel7 + w2 * _v2_channel7 + w3 * _v3_channel7 + w4 * _v4_channel7; + _val_channel8 = w1 * _v1_channel8 + w2 * _v2_channel8 + w3 * _v3_channel8 + w4 * _v4_channel8; + _val_channel9 = w1 * _v1_channel9 + w2 * _v2_channel9 + w3 * _v3_channel9 + w4 * _v4_channel9; + _val_channela = w1 * _v1_channela + w2 * _v2_channela + w3 * _v3_channela + w4 * _v4_channela; + _val_channelb = w1 * _v1_channelb + w2 * _v2_channelb + w3 * _v3_channelb + w4 * _v4_channelb; + _val_channelc = w1 * _v1_channelc + w2 * _v2_channelc + w3 * _v3_channelc + w4 * _v4_channelc; + _val_channeld = w1 * _v1_channeld + w2 * _v2_channeld + w3 * _v3_channeld + w4 * _v4_channeld; + _val_channele = w1 * _v1_channele + w2 * _v2_channele + w3 * _v3_channele + w4 * _v4_channele; + _val_channelf = w1 * _v1_channelf + w2 * _v2_channelf + w3 * _v3_channelf + w4 * _v4_channelf; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + _val_channel4 *= mask_; + _val_channel5 *= mask_; + _val_channel6 *= mask_; + _val_channel7 *= mask_; + _val_channel8 *= mask_; + _val_channel9 *= mask_; + _val_channela *= mask_; + _val_channelb *= mask_; + _val_channelc *= mask_; + _val_channeld *= mask_; + _val_channele *= mask_; + _val_channelf *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + _sum += (_val_channel0 * _conv_w0); + _sum += (_val_channel1 * _conv_w1); + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + _sum += (_val_channel2 * _conv_w2); + _sum += (_val_channel3 * _conv_w3); + float _conv_w4 = *(kptr + 4); // 4 * out_elempack + float _conv_w5 = *(kptr + 5); // 5 * out_elempack + _sum += (_val_channel4 * _conv_w4); + _sum += (_val_channel5 * _conv_w5); + float _conv_w6 = *(kptr + 6); // 6 * out_elempack + float _conv_w7 = *(kptr + 7); // 7 * out_elempack + _sum += (_val_channel6 * _conv_w6); + _sum += (_val_channel7 * _conv_w7); + float _conv_w8 = *(kptr + 8); // 8 * out_elempack + float _conv_w9 = *(kptr + 9); // 9 * out_elempack + _sum += (_val_channel8 * _conv_w8); + _sum += (_val_channel9 * _conv_w9); + float _conv_wa = *(kptr + 10); // 10 * out_elempack + float _conv_wb = *(kptr + 11); // 11 * out_elempack + _sum += (_val_channela * _conv_wa); + _sum += (_val_channelb * _conv_wb); + float _conv_wc = *(kptr + 12); // 12 * out_elempack + float _conv_wd = *(kptr + 13); // 13 * out_elempack + _sum += (_val_channelc * _conv_wc); + _sum += (_val_channeld * _conv_wd); + float 
_conv_we = *(kptr + 14); // 14 * out_elempack + float _conv_wf = *(kptr + 15); // 15 * out_elempack + _sum += (_val_channele * _conv_we); + _sum += (_val_channelf * _conv_wf); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to4.h b/src/layer/x86/deformableconv2d_pack16to4.h new file mode 100644 index 000000000000..a75e26ec8cf4 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to4.h @@ -0,0 +1,435 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to4_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + __m128 _val_channel4 = _val_channel0; + __m128 _val_channel5 = _val_channel0; + __m128 _val_channel6 = _val_channel0; + __m128 _val_channel7 = _val_channel0; + __m128 _val_channel8 = _val_channel0; + __m128 _val_channel9 = _val_channel0; + __m128 _val_channela = _val_channel0; + __m128 _val_channelb = _val_channel0; + __m128 _val_channelc = _val_channel0; + __m128 _val_channeld = _val_channel0; + __m128 _val_channele = _val_channel0; + __m128 _val_channelf = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v1_channel4 = _val_channel0; + __m128 _v1_channel5 = _val_channel0; + __m128 _v1_channel6 = _val_channel0; + __m128 _v1_channel7 = _val_channel0; + __m128 _v1_channel8 = _val_channel0; + __m128 _v1_channel9 = _val_channel0; + __m128 _v1_channela = _val_channel0; + __m128 _v1_channelb = _val_channel0; + __m128 _v1_channelc = _val_channel0; + __m128 _v1_channeld = _val_channel0; + __m128 _v1_channele = _val_channel0; + __m128 _v1_channelf = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v2_channel4 = _val_channel0; + __m128 _v2_channel5 = _val_channel0; + __m128 _v2_channel6 = _val_channel0; + __m128 _v2_channel7 = _val_channel0; + __m128 _v2_channel8 = _val_channel0; + __m128 _v2_channel9 = _val_channel0; + __m128 _v2_channela = _val_channel0; + __m128 _v2_channelb = _val_channel0; + __m128 _v2_channelc = _val_channel0; + __m128 _v2_channeld = _val_channel0; + __m128 _v2_channele = _val_channel0; + __m128 _v2_channelf = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v3_channel4 = _val_channel0; + __m128 _v3_channel5 = _val_channel0; + __m128 _v3_channel6 = _val_channel0; + __m128 _v3_channel7 = _val_channel0; + __m128 _v3_channel8 = _val_channel0; + __m128 _v3_channel9 = _val_channel0; + __m128 _v3_channela = _val_channel0; + __m128 _v3_channelb = _val_channel0; + __m128 _v3_channelc = _val_channel0; + __m128 _v3_channeld = _val_channel0; + __m128 _v3_channele = _val_channel0; + __m128 _v3_channelf = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + __m128 _v4_channel4 = _val_channel0; + __m128 _v4_channel5 = _val_channel0; + __m128 _v4_channel6 = _val_channel0; + __m128 _v4_channel7 = _val_channel0; + __m128 _v4_channel8 = _val_channel0; + __m128 _v4_channel9 = _val_channel0; + __m128 _v4_channela = _val_channel0; + __m128 _v4_channelb = _val_channel0; + __m128 _v4_channelc = _val_channel0; + __m128 _v4_channeld = _val_channel0; + __m128 _v4_channele = _val_channel0; + __m128 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 4); + 
_v1_channel5 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 7); + 
_v4_channel8 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 10); + _v4_channelb = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 15); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm_comp_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm_comp_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v4_channel9, _w4, _val_channel9); + 
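+ // the remaining input channels a..f below pick up the same four-tap blend w1*v1 + w2*v2 + w3*v3 + w4*v4 via fused multiply-adds.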
_val_channela = _mm_comp_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm_comp_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm_comp_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm_comp_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm_comp_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm_comp_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v4_channelf, _w4, _val_channelf); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm_mul_ps(_val_channel9, _mask); + _val_channela = _mm_mul_ps(_val_channela, _mask); + _val_channelb = _mm_mul_ps(_val_channelb, _mask); + _val_channelc = _mm_mul_ps(_val_channelc, _mask); + _val_channeld = _mm_mul_ps(_val_channeld, _mask); + _val_channele = _mm_mul_ps(_val_channele, _mask); + _val_channelf = _mm_mul_ps(_val_channelf, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack + __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack + __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel7, 
_conv_w7, _sum); + __m128 _conv_w8 = _mm_load_ps(kptr + 32); // 8 * out_elempack + __m128 _conv_w9 = _mm_load_ps(kptr + 36); // 9 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m128 _conv_wa = _mm_load_ps(kptr + 40); // 10 * out_elempack + __m128 _conv_wb = _mm_load_ps(kptr + 44); // 11 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm_comp_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m128 _conv_wc = _mm_load_ps(kptr + 48); // 12 * out_elempack + __m128 _conv_wd = _mm_load_ps(kptr + 52); // 13 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm_comp_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m128 _conv_we = _mm_load_ps(kptr + 56); // 14 * out_elempack + __m128 _conv_wf = _mm_load_ps(kptr + 60); // 15 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm_comp_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to8.h b/src/layer/x86/deformableconv2d_pack16to8.h new file mode 100644 index 000000000000..f44fc9ad0c8e --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to8.h @@ -0,0 +1,435 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to8_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + 
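+ // the scalar bilinear weights w1..w4 and the modulation mask are replicated into 8-wide arrays here so the per-channel loop can apply them with 256-bit multiply-adds.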
const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + __m256 _val_channel4 = _val_channel0; + __m256 _val_channel5 = _val_channel0; + __m256 _val_channel6 = _val_channel0; + __m256 _val_channel7 = _val_channel0; + __m256 _val_channel8 = _val_channel0; + __m256 _val_channel9 = _val_channel0; + __m256 _val_channela = _val_channel0; + __m256 _val_channelb = _val_channel0; + __m256 _val_channelc = _val_channel0; + __m256 _val_channeld = _val_channel0; + __m256 _val_channele = _val_channel0; + __m256 _val_channelf = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v1_channel4 = _val_channel0; + __m256 _v1_channel5 = _val_channel0; + __m256 _v1_channel6 = _val_channel0; + __m256 _v1_channel7 = _val_channel0; + __m256 _v1_channel8 = _val_channel0; + __m256 _v1_channel9 = _val_channel0; + __m256 _v1_channela = _val_channel0; + __m256 _v1_channelb = _val_channel0; + __m256 _v1_channelc = _val_channel0; + __m256 _v1_channeld = _val_channel0; + __m256 _v1_channele = _val_channel0; + __m256 _v1_channelf = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v2_channel4 = _val_channel0; + __m256 _v2_channel5 = _val_channel0; + __m256 _v2_channel6 = _val_channel0; + __m256 _v2_channel7 = _val_channel0; + __m256 _v2_channel8 = _val_channel0; + __m256 _v2_channel9 = _val_channel0; + __m256 _v2_channela = _val_channel0; + __m256 _v2_channelb = _val_channel0; + __m256 _v2_channelc = _val_channel0; + __m256 _v2_channeld = _val_channel0; + __m256 _v2_channele = _val_channel0; + __m256 _v2_channelf = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v3_channel4 = _val_channel0; + __m256 _v3_channel5 = _val_channel0; + __m256 _v3_channel6 = _val_channel0; + __m256 _v3_channel7 = _val_channel0; + __m256 _v3_channel8 = _val_channel0; + __m256 _v3_channel9 = _val_channel0; + __m256 _v3_channela = _val_channel0; + __m256 _v3_channelb = _val_channel0; + __m256 _v3_channelc = _val_channel0; + __m256 _v3_channeld = _val_channel0; + __m256 _v3_channele = _val_channel0; + __m256 _v3_channelf = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + __m256 _v4_channel4 = _val_channel0; + __m256 _v4_channel5 = _val_channel0; + __m256 _v4_channel6 = _val_channel0; + __m256 _v4_channel7 = _val_channel0; + __m256 _v4_channel8 = _val_channel0; + __m256 _v4_channel9 = _val_channel0; + __m256 _v4_channela = _val_channel0; + __m256 _v4_channelb = _val_channel0; + __m256 _v4_channelc = _val_channel0; + __m256 _v4_channeld = _val_channel0; + __m256 _v4_channele = _val_channel0; + __m256 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = 
_mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = 
_mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 7); + _v4_channel8 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 10); + _v4_channelb = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 15); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm256_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm256_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm256_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm256_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v3_channel7, 
_w3, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm256_comp_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm256_comp_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm256_comp_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm256_comp_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm256_comp_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm256_comp_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm256_comp_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm256_comp_fmadd_ps(_v4_channel9, _w4, _val_channel9); + _val_channela = _mm256_comp_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm256_comp_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm256_comp_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm256_comp_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm256_comp_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm256_comp_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm256_comp_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm256_comp_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm256_comp_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm256_comp_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm256_comp_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm256_comp_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm256_comp_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm256_comp_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm256_comp_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm256_comp_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm256_comp_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm256_comp_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm256_comp_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm256_comp_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm256_comp_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm256_comp_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm256_comp_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm256_comp_fmadd_ps(_v4_channelf, _w4, _val_channelf); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm256_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm256_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm256_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm256_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm256_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm256_mul_ps(_val_channel9, _mask); + _val_channela = _mm256_mul_ps(_val_channela, _mask); + _val_channelb = _mm256_mul_ps(_val_channelb, _mask); + _val_channelc = _mm256_mul_ps(_val_channelc, _mask); + _val_channeld = _mm256_mul_ps(_val_channeld, _mask); + _val_channele = _mm256_mul_ps(_val_channele, _mask); + _val_channelf = _mm256_mul_ps(_val_channelf, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = 
_mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack + __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack + __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + __m256 _conv_w8 = _mm256_load_ps(kptr + 64); // 8 * out_elempack + __m256 _conv_w9 = _mm256_load_ps(kptr + 72); // 9 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m256 _conv_wa = _mm256_load_ps(kptr + 80); // 10 * out_elempack + __m256 _conv_wb = _mm256_load_ps(kptr + 88); // 11 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m256 _conv_wc = _mm256_load_ps(kptr + 96); // 12 * out_elempack + __m256 _conv_wd = _mm256_load_ps(kptr + 104); // 13 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m256 _conv_we = _mm256_load_ps(kptr + 112); // 14 * out_elempack + __m256 _conv_wf = _mm256_load_ps(kptr + 120); // 15 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack1to16.h b/src/layer/x86/deformableconv2d_pack1to16.h new file mode 100644 index 000000000000..b50e787e9c8e --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack1to16.h @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
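+// Illustrative reference only -- a scalar sketch of the per-tap bilinear sampling that the
+// deformableconv2d_pack* kernels in this patch perform lane by lane. It is not called by the
+// kernel below, its name is invented for documentation, and it assumes the plain row-major
+// pack-1 input layout that this pack1to16 kernel reads.
+// static inline float deformableconv2d_bilinear_sample_ref(const float* im, int w, int h, float h_im, float w_im)
+// {
+//     // outside the (-1, h) x (-1, w) window the tap contributes nothing
+//     if (!(h_im > -1 && w_im > -1 && h_im < h && w_im < w))
+//         return 0.f;
+//     int h_low = (int)floor(h_im);
+//     int w_low = (int)floor(w_im);
+//     int h_high = h_low + 1;
+//     int w_high = w_low + 1;
+//     float lh = h_im - h_low;
+//     float lw = w_im - w_low;
+//     float hh = 1.f - lh;
+//     float hw = 1.f - lw;
+//     // corner samples are zero when they fall off the image, matching the v*_cond checks in the kernels
+//     float v1 = (h_low >= 0 && w_low >= 0) ? im[h_low * w + w_low] : 0.f;
+//     float v2 = (h_low >= 0 && w_high <= w - 1) ? im[h_low * w + w_high] : 0.f;
+//     float v3 = (h_high <= h - 1 && w_low >= 0) ? im[h_high * w + w_low] : 0.f;
+//     float v4 = (h_high <= h - 1 && w_high <= w - 1) ? im[h_high * w + w_high] : 0.f;
+//     return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
+// }
+// The kernel below inlines this logic, guarding each corner load with the v*_cond flags and folding
+// the weights into SIMD multiply-adds across the packed output lanes.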
+ +static void deformableconv2d_pack1to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 1; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; 
+ if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack1to4.h b/src/layer/x86/deformableconv2d_pack1to4.h new file mode 100644 index 000000000000..0388111306fa --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack1to4.h @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
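Every kernel in this patch repeats the same bilinear sampling with zero padding before the packed multiply-accumulate: the sampling point is `h_in + i * dilation_h + offset_h` / `w_in + j * dilation_w + offset_w`, anything outside the open band (-1, h) x (-1, w) reads as zero, and each of the four corner taps is dropped individually when it falls off the image. A scalar restatement, offered only as a sketch (`im` is a hypothetical w x h single-channel plane, not a name from this patch):

    #include <math.h>

    // Scalar reference for the sampling logic that the v1..v4 / w1..w4 blocks above vectorize.
    static float bilinear_sample_zeropad(const float* im, int w, int h, float w_im, float h_im)
    {
        // outside the (-1, w) x (-1, h) band the whole tap is zero
        if (!(h_im > -1 && w_im > -1 && h_im < h && w_im < w))
            return 0.f;

        int h_low = (int)floorf(h_im);
        int w_low = (int)floorf(w_im);
        int h_high = h_low + 1;
        int w_high = w_low + 1;

        float lh = h_im - h_low;
        float lw = w_im - w_low;
        float hh = 1.f - lh;
        float hw = 1.f - lw;

        // corners that fall off the image contribute zero (the v1_cond..v4_cond checks above)
        float v1 = (h_low >= 0 && w_low >= 0) ? im[h_low * w + w_low] : 0.f;
        float v2 = (h_low >= 0 && w_high <= w - 1) ? im[h_low * w + w_high] : 0.f;
        float v3 = (h_high <= h - 1 && w_low >= 0) ? im[h_high * w + w_low] : 0.f;
        float v4 = (h_high <= h - 1 && w_high <= w - 1) ? im[h_high * w + w_high] : 0.f;

        return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
    }

The optional modulation mask then scales this sampled value, which is why the `_mm*_mul_ps(_val_channelN, _mask)` step sits between the sampling and the weight FMA in each kernel.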
+ +static void deformableconv2d_pack1to4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 1; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 
= hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack1to8.h b/src/layer/x86/deformableconv2d_pack1to8.h new file mode 100644 index 000000000000..fe1e0c8c0a68 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack1to8.h @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack1to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 1; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4.h b/src/layer/x86/deformableconv2d_pack4.h new file mode 100644 index 000000000000..32b27963fb16 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4.h @@ -0,0 +1,243 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, 
_mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to1.h b/src/layer/x86/deformableconv2d_pack4to1.h new file mode 100644 index 000000000000..7ee073a91cb5 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to1.h @@ -0,0 +1,211 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4to1_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float 
_v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to16.h b/src/layer/x86/deformableconv2d_pack4to16.h new file mode 100644 index 000000000000..809bb7cb2b50 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to16.h @@ -0,0 +1,243 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
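One more pattern shared by all of these kernels is the offset/mask addressing: when the offset or mask blob arrives packed (`elempack != 1`), logical channel `c` is read from packed channel `c / elempack` at lane `c % elempack` of the element at column `w_col`. A small helper sketch of that lookup; `read_logical_channel` is a hypothetical name, and it assumes ncnn's `Mat::channel()` / `row()` accessors exactly as they are used above:

    #include "mat.h" // ncnn::Mat, path as seen from ncnn's src/ tree

    // Read element (y, x) of logical channel c from a blob that may or may not be packed,
    // mirroring the offset_not_pack / mask_not_pack branches in the kernels above.
    static float read_logical_channel(const ncnn::Mat& blob, int c, int y, int x)
    {
        if (blob.elempack == 1)
            return blob.channel(c).row(y)[x];

        // packed case: lane c % elempack of the x-th packed element in row y
        return blob.channel(c / blob.elempack).row(y)[x * blob.elempack + c % blob.elempack];
    }

With such a helper, the two branches above reduce to `offset_h = read_logical_channel(offset, (i * kernel_w + j) * 2, h_col, w_col)`, and similarly for `offset_w` and the modulation mask.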
+ +static void deformableconv2d_pack4to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; 
+ if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + 
_val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to8.h b/src/layer/x86/deformableconv2d_pack4to8.h new file mode 100644 index 000000000000..84099691826b --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to8.h @@ -0,0 +1,243 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); 
+ } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8.h b/src/layer/x86/deformableconv2d_pack8.h new file mode 100644 index 000000000000..277817e39482 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8.h @@ -0,0 +1,307 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + __m256 _val_channel4 = _val_channel0; + __m256 _val_channel5 = _val_channel0; + __m256 _val_channel6 = _val_channel0; + __m256 _val_channel7 = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v1_channel4 = _val_channel0; + __m256 _v1_channel5 = _val_channel0; + __m256 _v1_channel6 = _val_channel0; + __m256 _v1_channel7 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v2_channel4 = _val_channel0; + __m256 _v2_channel5 = _val_channel0; + __m256 _v2_channel6 = _val_channel0; + __m256 _v2_channel7 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v3_channel4 = _val_channel0; + __m256 _v3_channel5 = _val_channel0; + __m256 _v3_channel6 = _val_channel0; + __m256 _v3_channel7 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + __m256 _v4_channel4 = _val_channel0; + __m256 _v4_channel5 = _val_channel0; + __m256 _v4_channel6 = _val_channel0; + __m256 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm256_broadcast_ss(data_im_ptr + v3_pos 
* elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 7); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm256_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm256_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm256_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm256_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm256_mul_ps(_val_channel4, _mask); + _val_channel5 = 
_mm256_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm256_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm256_mul_ps(_val_channel7, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack + __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack + __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to1.h b/src/layer/x86/deformableconv2d_pack8to1.h new file mode 100644 index 000000000000..c4b97b40f062 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to1.h @@ -0,0 +1,259 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to1_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + float _val_channel4 = _val_channel0; + float _val_channel5 = _val_channel0; + float _val_channel6 = _val_channel0; + float _val_channel7 = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v1_channel4 = _val_channel0; + float _v1_channel5 = _val_channel0; + 
float _v1_channel6 = _val_channel0; + float _v1_channel7 = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v2_channel4 = _val_channel0; + float _v2_channel5 = _val_channel0; + float _v2_channel6 = _val_channel0; + float _v2_channel7 = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float _v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v3_channel4 = _val_channel0; + float _v3_channel5 = _val_channel0; + float _v3_channel6 = _val_channel0; + float _v3_channel7 = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + float _v4_channel4 = _val_channel0; + float _v4_channel5 = _val_channel0; + float _v4_channel6 = _val_channel0; + float _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = *(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = *(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = *(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = *(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = *(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = *(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = *(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = *(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = *(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = *(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = *(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = *(data_im_ptr + v3_pos * elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = *(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = *(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = *(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = *(data_im_ptr + v4_pos * elempack + 7); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + _val_channel4 = w1 * _v1_channel4 + w2 * _v2_channel4 + w3 * _v3_channel4 + w4 * _v4_channel4; + _val_channel5 = w1 * _v1_channel5 + w2 * _v2_channel5 + w3 * _v3_channel5 + w4 * _v4_channel5; + _val_channel6 = w1 * _v1_channel6 + w2 * _v2_channel6 + w3 * _v3_channel6 + w4 * 
_v4_channel6; + _val_channel7 = w1 * _v1_channel7 + w2 * _v2_channel7 + w3 * _v3_channel7 + w4 * _v4_channel7; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + _val_channel4 *= mask_; + _val_channel5 *= mask_; + _val_channel6 *= mask_; + _val_channel7 *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + float _conv_w4 = *(kptr + 4); // 4 * out_elempack + float _conv_w5 = *(kptr + 5); // 5 * out_elempack + float _conv_w6 = *(kptr + 6); // 6 * out_elempack + float _conv_w7 = *(kptr + 7); // 7 * out_elempack + _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3 + _val_channel4 * _conv_w4 + _val_channel5 * _conv_w5 + _val_channel6 * _conv_w6 + _val_channel7 * _conv_w7); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to16.h b/src/layer/x86/deformableconv2d_pack8to16.h new file mode 100644 index 000000000000..15e5ed076e64 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to16.h @@ -0,0 +1,307 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, 
w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + __m512 _val_channel4 = _val_channel0; + __m512 _val_channel5 = _val_channel0; + __m512 _val_channel6 = _val_channel0; + __m512 _val_channel7 = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v1_channel4 = _val_channel0; + __m512 _v1_channel5 = _val_channel0; + __m512 _v1_channel6 = _val_channel0; + __m512 _v1_channel7 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v2_channel4 = _val_channel0; + __m512 _v2_channel5 = _val_channel0; + __m512 _v2_channel6 = _val_channel0; + __m512 _v2_channel7 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v3_channel4 = _val_channel0; + __m512 _v3_channel5 = _val_channel0; + __m512 _v3_channel6 = _val_channel0; + __m512 _v3_channel7 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + __m512 _v4_channel4 = _val_channel0; + __m512 _v4_channel5 = _val_channel0; + __m512 _v4_channel6 = _val_channel0; + __m512 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + _v1_channel4 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 4]); + _v1_channel5 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 5]); + _v1_channel6 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 6]); + _v1_channel7 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 7]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + _v2_channel4 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 4]); + _v2_channel5 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 5]); + _v2_channel6 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 6]); + _v2_channel7 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 7]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + _v3_channel4 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 4]); + _v3_channel5 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 5]); + _v3_channel6 = 
_mm512_set1_ps(data_im_ptr[v3_pos * elempack + 6]); + _v3_channel7 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 7]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + _v4_channel4 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 4]); + _v4_channel5 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 5]); + _v4_channel6 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 6]); + _v4_channel7 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 7]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm512_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm512_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm512_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm512_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm512_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm512_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm512_mul_ps(_val_channel6, _mask); + _val_channel7 = 
_mm512_mul_ps(_val_channel7, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack + __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m512 _conv_w6 = _mm512_load_ps(kptr + 96); // 6 * out_elempack + __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to4.h b/src/layer/x86/deformableconv2d_pack8to4.h new file mode 100644 index 000000000000..85aa06aaa036 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to4.h @@ -0,0 +1,307 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to4_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + __m128 _val_channel4 = _val_channel0; + __m128 _val_channel5 = _val_channel0; + __m128 _val_channel6 = _val_channel0; + __m128 _val_channel7 = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v1_channel4 = _val_channel0; + __m128 _v1_channel5 = _val_channel0; + __m128 _v1_channel6 = _val_channel0; + __m128 _v1_channel7 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v2_channel4 = _val_channel0; + __m128 _v2_channel5 = _val_channel0; + __m128 _v2_channel6 = _val_channel0; + __m128 _v2_channel7 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v3_channel4 = _val_channel0; + __m128 _v3_channel5 = _val_channel0; + __m128 _v3_channel6 = _val_channel0; + __m128 _v3_channel7 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + __m128 _v4_channel4 = _val_channel0; + __m128 _v4_channel5 = _val_channel0; + __m128 _v4_channel6 = _val_channel0; + __m128 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = 
_mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 7); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm_mul_ps(_val_channel7, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = 
_mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack + __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack + __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_sgemm.h b/src/layer/x86/deformableconv2d_sgemm.h new file mode 100644 index 000000000000..648af448b128 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm.h" + +static void deformableconv2d_im2col_sgemm_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16.h b/src/layer/x86/deformableconv2d_sgemm_pack16.h new file mode 100644 index 000000000000..37aab40f1e45 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16.h" + +static void deformableconv2d_im2col_sgemm_pack16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high 
<= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to1.h b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h new file mode 100644 index 000000000000..686333e6ee4c --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
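+
+// Deformable im2col: for each output location and kernel tap, the learned offset
+// shifts the sampling point to (h_im, w_im); the input is read there by bilinear
+// interpolation over the four neighbouring pixels, out-of-bounds taps contribute
+// zero, and the optional third bottom blob is applied as a per-tap modulation mask.
+// The gathered pack16 columns are then fed to the pack16to1 sgemm kernel.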
+ +#include "convolution_sgemm_pack16to1.h" + +static void deformableconv2d_im2col_sgemm_pack16to1_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, 
w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to1_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to4.h b/src/layer/x86/deformableconv2d_sgemm_pack16to4.h new file mode 100644 index 000000000000..a7438d1f983a --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to4.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16to4.h" + +static void deformableconv2d_im2col_sgemm_pack16to4_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* 
data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to4_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to8.h b/src/layer/x86/deformableconv2d_sgemm_pack16to8.h new file mode 100644 index 000000000000..d441d2549404 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to8.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16to8.h" + +static void deformableconv2d_im2col_sgemm_pack16to8_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* 
data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to8_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to16.h b/src/layer/x86/deformableconv2d_sgemm_pack1to16.h new file mode 100644 index 000000000000..d30c11926fdb --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to16.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to16.h" + +static void deformableconv2d_im2col_sgemm_pack1to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to4.h b/src/layer/x86/deformableconv2d_sgemm_pack1to4.h new file mode 100644 index 000000000000..0070999c05cd --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to4.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to4.h" + +static void deformableconv2d_im2col_sgemm_pack1to4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) 
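+ // note: descriptive comment added for clarity, not part of the original patch. + // For each input channel, bilinear-interpolate the sample at (h_im, w_im) from its + // four in-bounds neighbours using weights w1..w4, apply the modulation mask, and + // store the value into the im2col buffer at row (i * kernel_w + j), column (h_col * outw + w_col).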
+ { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to4_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to8.h b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h new file mode 100644 index 000000000000..d02c4245d7c2 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to8.h" + +static void deformableconv2d_im2col_sgemm_pack1to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4.h b/src/layer/x86/deformableconv2d_sgemm_pack4.h new file mode 100644 index 000000000000..140fa78e5228 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4.h" + +static void deformableconv2d_im2col_sgemm_pack4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = 
(h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to1.h b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h new file mode 100644 index 000000000000..d5d7b57cab51 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to1.h" + +static void deformableconv2d_im2col_sgemm_pack4to1_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to1_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to16.h b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h new file mode 100644 index 000000000000..7eef68bb01a8 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to16.h" + +static void deformableconv2d_im2col_sgemm_pack4to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to8.h b/src/layer/x86/deformableconv2d_sgemm_pack4to8.h new file mode 100644 index 000000000000..1096d5dc8343 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to8.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to8.h" + +static void deformableconv2d_im2col_sgemm_pack4to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8.h b/src/layer/x86/deformableconv2d_sgemm_pack8.h new file mode 100644 index 000000000000..fce556068598 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8.h" + +static void deformableconv2d_im2col_sgemm_pack8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to1.h b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h new file mode 100644 index 000000000000..635c08625ab1 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to1.h" + +static void deformableconv2d_im2col_sgemm_pack8to1_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to1_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to16.h b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h new file mode 100644 index 000000000000..161e983f1a01 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to16.h" + +static void deformableconv2d_im2col_sgemm_pack8to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to4.h b/src/layer/x86/deformableconv2d_sgemm_pack8to4.h new file mode 100644 index 000000000000..45c853d2262c --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to4.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to4.h" + +static void deformableconv2d_im2col_sgemm_pack8to4_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to4_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 869815283d9e..caff2e17d06f 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -14,63 +14,352 @@ #include "deformableconv2d_x86.h" +#if __SSE2__ +#include +#if __SSE4_1__ +#include +#if __AVX__ +#include +#endif +#endif // __SSE4_1__ +#endif // __SSE2__ +#include "x86_activation.h" +#include "x86_usability.h" + +#include "benchmark.h" +#include "cpu.h" #include "layer_type.h" namespace ncnn { +#include "deformableconv2d_sgemm.h" + +#if __SSE2__ +#include "deformableconv2d_pack4.h" +#include "deformableconv2d_pack1to4.h" +#include "deformableconv2d_pack4to1.h" + +#include "deformableconv2d_sgemm_pack4.h" +#include "deformableconv2d_sgemm_pack1to4.h" +#include "deformableconv2d_sgemm_pack4to1.h" + +#if __AVX__ +#include "deformableconv2d_pack8.h" +#include "deformableconv2d_pack4to8.h" +#include "deformableconv2d_pack1to8.h" +#include "deformableconv2d_pack8to4.h" +#include "deformableconv2d_pack8to1.h" + +#include "deformableconv2d_sgemm_pack8.h" +#include "deformableconv2d_sgemm_pack4to8.h" +#include "deformableconv2d_sgemm_pack1to8.h" +#include "deformableconv2d_sgemm_pack8to4.h" +#include "deformableconv2d_sgemm_pack8to1.h" + +#if __AVX512F__ +#include "deformableconv2d_pack16.h" +#include "deformableconv2d_pack8to16.h" +#include "deformableconv2d_pack4to16.h" +#include "deformableconv2d_pack1to16.h" +#include "deformableconv2d_pack16to8.h" +#include "deformableconv2d_pack16to4.h" +#include "deformableconv2d_pack16to1.h" + +#include "deformableconv2d_sgemm_pack16.h" +#include "deformableconv2d_sgemm_pack8to16.h" +#include "deformableconv2d_sgemm_pack4to16.h" +#include "deformableconv2d_sgemm_pack1to16.h" +#include "deformableconv2d_sgemm_pack16to8.h" +#include "deformableconv2d_sgemm_pack16to4.h" +#include "deformableconv2d_sgemm_pack16to1.h" +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + DeformableConv2D_x86::DeformableConv2D_x86() { - one_blob_only = false; - support_inplace = false; +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ - inner_product = 0; - permute = 0; + activation = 0; } -int DeformableConv2D_x86::create_pipeline(const Option& opt) +static int _4Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int l1, int l2, int l3) +{ + return ((i0 * l1 + i1) * l2 + i2) * l3 + i3; +} + +static int _6Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int i4, int i5, int l1, int l2, int l3, int l4, int l5) +{ + return ((((i0 * l1 + i1) * l2 + i2) * l3 + i3) * l4 + i4) * l5 + i5; +} + +static void 
deformableconv2d_transform_kernel_packed_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) { - const int in_c = weight_data_size / (num_output * kernel_h * kernel_w); + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-inch/pa-kw-kh-outch/pb { - Mat weight_3d = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); - weight_data_t.create(in_c, kernel_w * kernel_h, num_output); - if (weight_data_t.empty()) - return -100; - for (int q = 0; q < num_output; q++) - { - const Mat m = weight_3d.channel(q); - float* outptr = weight_data_t.channel(q); + const float* weight_ptr = weight_data; - for (int i = 0; i < kernel_w * kernel_h; i++) + weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); + float* ptr = weight_data_tm; + for (int oc = 0; oc < num_output; oc++) + { + for (int i = 0; i < kernel_h; i++) { - for (int j = 0; j < in_c; j++) + for (int j = 0; j < kernel_w; j++) { - *outptr++ = m.row(j)[i]; + for (int ic = 0; ic < num_input; ic++) + { + ptr[_6Dindex_to_1Dindex(oc / out_elempack, i, j, ic / elempack, ic % elempack, oc % out_elempack, kernel_h, kernel_w, num_input / elempack, elempack, out_elempack)] = weight_ptr[_4Dindex_to_1Dindex(oc, ic, i, j, num_input, kernel_h, kernel_w)]; + } } } } - weight_3d.release(); - weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); + weight_data_tm = weight_data_tm.reshape(num_input / elempack, maxk, num_output / out_elempack); + } +} + +int DeformableConv2D_x86::create_pipeline(const Option& opt) +{ + activation = create_activation_layer(activation_type, activation_params, opt); + + int kernel_size = kernel_w * kernel_h; + int num_input = weight_data_size / kernel_size / num_output; + + int elempack = 1; + int out_elempack = 1; + +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + elempack = num_input % 16 == 0 ? 16 : num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + elempack = num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; +#endif + } +#endif // __SSE2__ + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 8 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to8_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 4 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to4_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 1 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to1_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + +#endif // __AVX512F__ + + // pack8 + if (elempack == 8 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack4to8 + if (elempack == 4 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, 
num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1to8 + if (elempack == 1 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack8to4 + if (elempack == 8 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack8to1 + if (elempack == 8 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __AVX__ + + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1to4 + if (elempack == 1 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __SSE2__ - inner_product = ncnn::create_layer(ncnn::LayerType::InnerProduct); - ncnn::ParamDict pd; - pd.set(0, num_output); - pd.set(1, bias_term); - pd.set(2, weight_data_size); - pd.set(9, activation_type); - pd.set(10, activation_params); - inner_product->load_param(pd); - ncnn::Mat weights[2]; - weights[0] = weight_data_t; - if (bias_term) - weights[1] = bias_data; - inner_product->load_model(ncnn::ModelBinFromMatArray(weights)); - inner_product->create_pipeline(opt); + // pack1 + if (elempack == 1 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } - permute = ncnn::create_layer(ncnn::LayerType::Permute); - ncnn::ParamDict permute_pd; - permute_pd.set(0, 1); - permute->load_param(permute_pd); - permute->create_pipeline(opt); + if (opt.lightmode) + { + weight_data.release(); } return 0; @@ -78,17 +367,11 @@ int 
DeformableConv2D_x86::create_pipeline(const Option& opt) int DeformableConv2D_x86::destroy_pipeline(const Option& opt) { - if (inner_product) + if (activation) { - inner_product->destroy_pipeline(opt); - delete inner_product; - inner_product = 0; - } - if (permute) - { - permute->destroy_pipeline(opt); - delete permute; - permute = 0; + activation->destroy_pipeline(opt); + delete activation; + activation = 0; } return 0; @@ -98,134 +381,427 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec { const Mat& bottom_blob = bottom_blobs[0]; const Mat& offset = bottom_blobs[1]; - const bool has_mask = (bottom_blobs.size() == 3); + Mat& top_blob = top_blobs[0]; - const int w = bottom_blob.w; - const int h = bottom_blob.h; - const int in_c = bottom_blob.c; - const size_t elemsize = bottom_blob.elemsize; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int out_w = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1; const int out_h = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1; - // output = im2col matmul weight_t, im2col.shape is [out_h * out_w, kernel_h * kernel_w * in_c] (in python), - // weight_t.shape is [num_output, kernel_h * kernel_w * in_c] (in python), - // output.shape is [out_h * out_w, num_output] (in python). - Mat im2col; - im2col.create(kernel_h * kernel_w * in_c * out_h * out_w, elemsize, opt.blob_allocator); - if (im2col.empty()) - return -100; + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + out_elempack = num_output % 4 == 0 ? 
4 : 1; +#endif + } +#endif // __SSE2__ + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& output = top_blobs[0]; - output.create(num_output, out_h * out_w, elemsize, opt.blob_allocator); - if (output.empty()) + top_blob.create(out_w, out_h, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) return -100; - Mat bottom_blob_flatten = bottom_blob.reshape(w * h * in_c); - Mat offset_flatten = offset.reshape(offset.w * offset.h * offset.c); - const float* data_im_ptr = bottom_blob_flatten; - const float* data_offset_ptr = offset_flatten; - float* im2col_ptr = im2col; + const int num_input = channels * elempack; - // im2col - #pragma omp parallel for num_threads(opt.num_threads) - for (int h_col = 0; h_col < out_h; h_col++) +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 16) { - for (int w_col = 0; w_col < out_w; w_col++) + if (opt.use_sgemm_convolution) { - int h_in = h_col * stride_h - pad_top; - int w_in = w_col * stride_w - pad_left; - float* data_col_ptr = im2col_ptr + (h_col * out_w + w_col) * kernel_h * kernel_w * in_c; - for (int i = 0; i < kernel_h; i++) + deformableconv2d_im2col_sgemm_pack16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) { - for (int j = 0; j < kernel_w; j++) - { - const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; - const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; - - const float offset_h = data_offset_ptr[data_offset_h_ptr]; - const float offset_w = data_offset_ptr[data_offset_w_ptr]; - const float mask_ = has_mask ? bottom_blobs[2].channel(i * kernel_w + j).row(h_col)[w_col] : 1.f; - const float h_im = h_in + i * dilation_h + offset_h; - const float w_im = w_in + j * dilation_w + offset_w; - - // Bilinear - const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; - float w1 = 0.f; - float w2 = 0.f; - float w3 = 0.f; - float w4 = 0.f; - bool v1_cond = false; - bool v2_cond = false; - bool v3_cond = false; - bool v4_cond = false; - int v1_pos = 0; - int v2_pos = 0; - int v3_pos = 0; - int v4_pos = 0; - if (cond) - { - int h_low = floor(h_im); - int w_low = floor(w_im); - int h_high = h_low + 1; - int w_high = w_low + 1; - - float lh = h_im - h_low; - float lw = w_im - w_low; - float hh = 1 - lh; - float hw = 1 - lw; - - v1_cond = (h_low >= 0 && w_low >= 0); - v2_cond = (h_low >= 0 && w_high <= w - 1); - v3_cond = (h_high <= h - 1 && w_low >= 0); - v4_cond = (h_high <= h - 1 && w_high <= w - 1); - if (v1_cond) - v1_pos = h_low * w + w_low; - if (v2_cond) - v2_pos = h_low * w + w_high; - if (v3_cond) - v3_pos = h_high * w + w_low; - if (v4_cond) - v4_pos = h_high * w + w_high; - - w1 = hh * hw; - w2 = hh * lw; - w3 = lh * hw; - w4 = lh * lw; - } + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, 
opt); + } + } + else + { + deformableconv2d_pack8to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 16 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack16to8_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16to8_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 16 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack16to4_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16to4_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack1to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack1to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 16 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack16to1_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16to1_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + +#endif // __AVX512F__ + + if (elempack == 8 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + 
else + { + deformableconv2d_pack8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack1to8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack1to8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4to8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4to8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to1_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack8to1_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to4_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack8to4_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } +#endif // __AVX__ + + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4_sse(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack1to4_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack1to4_sse(bottom_blobs, top_blob, 
weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4to1_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4to1_sse(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } +#endif // __SSE2__ + + if (elempack == 1 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + const float* weight_ptr = weight_data_tm; - const float* data_im_channel_ptr = data_im_ptr; - for (int c_im = 0; c_im < in_c; c_im++) + // naive deformable conv + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < out_h; h_col++) + { + for (int w_col = 0; w_col < out_w; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < num_output; oc++) { - float val = 0.f; - if (cond) + float sum = 0.f; + if (bias_term) + sum = bias_data[oc]; + for (int i = 0; i < kernel_h; i++) { - float v1 = v1_cond ? data_im_channel_ptr[v1_pos] : 0.f; - float v2 = v2_cond ? data_im_channel_ptr[v2_pos] : 0.f; - float v3 = v3_cond ? data_im_channel_ptr[v3_pos] : 0.f; - float v4 = v4_cond ? 
data_im_channel_ptr[v4_pos] : 0.f; - val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < channels; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? 
bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + sum += val * mask_ * weight_ptr[((oc * channels + ic) * kernel_h + i) * kernel_w + j]; + } + } } - *data_col_ptr = val * mask_; - data_col_ptr += 1; - data_im_channel_ptr += h * w; + top_blob.channel(oc).row(h_col)[w_col] = activation_ss(sum, activation_type, activation_params); } } } } } - im2col = im2col.reshape(kernel_h * kernel_w * in_c, out_h * out_w); - // call InnerProduct - inner_product->forward(im2col, output, opt); - ncnn::Mat output_t; - // call Permute - permute->forward(output, output_t, opt); - output_t = output_t.reshape(out_w, out_h, num_output); - top_blobs[0] = output_t; + return 0; } diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h index 0e21c9392af4..a4f02f8fccb0 100644 --- a/src/layer/x86/deformableconv2d_x86.h +++ b/src/layer/x86/deformableconv2d_x86.h @@ -30,10 +30,10 @@ class DeformableConv2D_x86 : virtual public DeformableConv2D virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: - Mat weight_data_t; + Layer* activation; - Layer* inner_product; - Layer* permute; + Mat weight_data_tm; + Mat weight_sgemm_data; }; } // namespace ncnn diff --git a/src/layer/x86/flatten_x86.cpp b/src/layer/x86/flatten_x86.cpp index 19e663197a5e..7c2ae662d6a2 100644 --- a/src/layer/x86/flatten_x86.cpp +++ b/src/layer/x86/flatten_x86.cpp @@ -141,7 +141,7 @@ int Flatten_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _re = _mm512_loadu_ps(ptr + 16 * 14); __m512 _rf = _mm512_loadu_ps(ptr + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr0, _r0); _mm512_storeu_ps(outptr1, _r1); @@ -230,7 +230,7 @@ int Flatten_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row6 = _mm256_loadu_ps(ptr + 48); __m256 _row7 = _mm256_loadu_ps(ptr + 56); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr0, _row0); _mm256_storeu_ps(outptr1, _row1); @@ -362,7 +362,7 @@ int Flatten_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _re = _mm512_loadu_ps(ptr + 16 * 14); __m512 _rf = _mm512_loadu_ps(ptr + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr0, _r0); _mm512_storeu_ps(outptr1, _r1); @@ -451,7 +451,7 @@ int Flatten_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row6 = _mm256_loadu_ps(ptr + 48); __m256 _row7 = _mm256_loadu_ps(ptr + 56); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr0, _row0); _mm256_storeu_ps(outptr1, _row1); diff --git a/src/layer/x86/gelu_x86.cpp b/src/layer/x86/gelu_x86.cpp new file mode 100644 index 000000000000..352d330b8777 --- /dev/null +++ b/src/layer/x86/gelu_x86.cpp @@ -0,0 +1,154 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "gelu_x86.h" + +#if __SSE2__ +#include +#include "sse_mathfun.h" +#if __AVX__ +#include +#include "avx_mathfun.h" +#if __AVX512F__ +#include "avx512_mathfun.h" +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + +namespace ncnn { + +GELU_x86::GELU_x86() +{ +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ +} + +int GELU_x86::create_pipeline(const Option& /*opt*/) +{ + if (!fast_gelu) + { + support_packing = false; + } + return 0; +} + +int GELU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + if (!fast_gelu) + { + return GELU::forward_inplace(bottom_top_blob, opt); + } + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int elempack = bottom_top_blob.elempack; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + __m512 _half512 = _mm512_set1_ps(0.5f); + __m512 _one512 = _mm512_set1_ps(1.f); + __m512 _fast1c512 = _mm512_set1_ps(0.79788452f); + __m512 _fast2c512 = _mm512_set1_ps(0.044715f); + for (; i + 15 < size; i += 16) + { + __m512 _pLoad = _mm512_loadu_ps(ptr); + + __m512 _cube = _mm512_mul_ps(_pLoad, _pLoad); + _cube = _mm512_mul_ps(_pLoad, _cube); + + __m512 _blob = _mm512_mul_ps(_fast2c512, _cube); + _blob = _mm512_add_ps(_pLoad, _blob); + _blob = _mm512_mul_ps(_fast1c512, _blob); + _blob = tanh512_ps(_blob); + _blob = _mm512_add_ps(_one512, _blob); + + _blob = _mm512_mul_ps(_half512, _mm512_mul_ps(_blob, _pLoad)); + + _mm512_storeu_ps(ptr, _blob); + + ptr += 16; + } +#endif // __AVX512F__ + __m256 _half256 = _mm256_set1_ps(0.5f); + __m256 _one256 = _mm256_set1_ps(1.f); + __m256 _fast1c256 = _mm256_set1_ps(0.79788452f); + __m256 _fast2c256 = _mm256_set1_ps(0.044715f); + for (; i + 7 < size; i += 8) + { + __m256 _pLoad = _mm256_loadu_ps(ptr); + + __m256 _cube = _mm256_mul_ps(_pLoad, _pLoad); + _cube = _mm256_mul_ps(_pLoad, _cube); + + __m256 _blob = _mm256_mul_ps(_fast2c256, _cube); + _blob = _mm256_add_ps(_pLoad, _blob); + _blob = _mm256_mul_ps(_fast1c256, _blob); + _blob = tanh256_ps(_blob); + _blob = _mm256_add_ps(_one256, _blob); + + _blob = _mm256_mul_ps(_half256, _mm256_mul_ps(_blob, _pLoad)); + + _mm256_storeu_ps(ptr, _blob); + + ptr += 8; + } +#endif // __AVX__ + __m128 _half128 = _mm_set1_ps(0.5f); + __m128 _one128 = _mm_set1_ps(1.f); + __m128 _fast1c128 = _mm_set1_ps(0.79788452f); + __m128 _fast2c128 = _mm_set1_ps(0.044715f); + for (; i + 3 < size; i += 4) + { + __m128 _pLoad = _mm_loadu_ps(ptr); + + __m128 _cube = _mm_mul_ps(_pLoad, _pLoad); + _cube = _mm_mul_ps(_pLoad, _cube); + + __m128 _blob = _mm_mul_ps(_fast2c128, _cube); + _blob = _mm_add_ps(_pLoad, _blob); + _blob = _mm_mul_ps(_fast1c128, _blob); + _blob = tanh_ps(_blob); + _blob = _mm_add_ps(_one128, _blob); + + _blob = _mm_mul_ps(_half128, _mm_mul_ps(_blob, _pLoad)); + + _mm_storeu_ps(ptr, 
_blob); + + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3))) + *ptr = 0.5f * *ptr * (1.0f + tanhf(0.79788452f * (*ptr + 0.044715f * *ptr * *ptr * *ptr))); + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/gelu_x86.h b/src/layer/x86/gelu_x86.h new file mode 100644 index 000000000000..75d821bfd45d --- /dev/null +++ b/src/layer/x86/gelu_x86.h @@ -0,0 +1,33 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_GELU_X86_H +#define LAYER_GELU_X86_H + +#include "gelu.h" + +namespace ncnn { + +class GELU_x86 : virtual public GELU +{ +public: + GELU_x86(); + + virtual int create_pipeline(const Option& opt); + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_GELU_X86_H diff --git a/src/layer/x86/innerproduct_fp.h b/src/layer/x86/innerproduct_fp.h new file mode 100644 index 000000000000..6edcd74dcd91 --- /dev/null +++ b/src/layer/x86/innerproduct_fp.h @@ -0,0 +1,1401 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ +void innerproduct_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); +void innerproduct_transform_kernel_fp16s_sse_f16c(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt); +#endif + +#if NCNN_IMPL_FP16S +static void innerproduct_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#else +static void innerproduct_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#endif +{ +#if NCNN_RUNTIME_CPU && NCNN_IMPL_FP16S && NCNN_F16C && __AVX__ && !__F16C__ + if (ncnn::cpu_support_x86_f16c()) + { + innerproduct_fp16s_sse_f16c(bottom_blob, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); + return; + } +#else // NCNN_RUNTIME_CPU + + const int num_input = bottom_blob.w * bottom_blob.elempack; + const int outw = top_blob.w; + const int out_elempack = top_blob.elempack; + + const float* bias_data_ptr = bias_data; + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (out_elempack == 16) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outw; p++) + { + __m512 _sum0 = _mm512_setzero_ps(); + __m512 _sum1 = _mm512_setzero_ps(); + __m512 _sum2 = _mm512_setzero_ps(); + __m512 _sum3 = _mm512_setzero_ps(); + __m512 _sum4 = _mm512_setzero_ps(); + __m512 _sum5 = _mm512_setzero_ps(); + __m512 _sum6 = _mm512_setzero_ps(); + __m512 _sum7 = _mm512_setzero_ps(); + + if (bias_data_ptr) + { + _sum0 = _mm512_loadu_ps(bias_data_ptr + p * 16); + } + +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif + const float* sptr = bottom_blob; + + int i = 0; + for (; i + 7 < num_input; i += 8) + { + __m512 _val0 = _mm512_set1_ps(sptr[0]); + __m512 _val1 = _mm512_set1_ps(sptr[1]); + __m512 _val2 = _mm512_set1_ps(sptr[2]); + __m512 _val3 = _mm512_set1_ps(sptr[3]); +#if NCNN_IMPL_FP16S + __m512i _w01 = _mm512_loadu_si512(kptr); + __m512i _w23 = _mm512_loadu_si512(kptr + 32); + __m512 _w0 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 0)); + __m512 _w1 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 1)); + __m512 _w2 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 0)); + __m512 _w3 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 1)); +#else + __m512 _w0 = _mm512_loadu_ps(kptr + 16 * 0); + __m512 _w1 = _mm512_loadu_ps(kptr + 16 * 1); + __m512 _w2 = _mm512_loadu_ps(kptr + 16 * 2); + __m512 _w3 = _mm512_loadu_ps(kptr + 16 * 3); +#endif + + _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); + + __m512 _val4 = _mm512_set1_ps(sptr[4]); + __m512 _val5 = _mm512_set1_ps(sptr[5]); + __m512 _val6 = _mm512_set1_ps(sptr[6]); + __m512 _val7 = _mm512_set1_ps(sptr[7]); +#if NCNN_IMPL_FP16S + __m512i _w45 = _mm512_loadu_si512(kptr + 64); + __m512i _w67 = _mm512_loadu_si512(kptr + 96); + __m512 _w4 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w45, 0)); + __m512 _w5 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w45, 1)); + __m512 _w6 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w67, 0)); + __m512 _w7 = 
_mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w67, 1)); +#else + __m512 _w4 = _mm512_loadu_ps(kptr + 16 * 4); + __m512 _w5 = _mm512_loadu_ps(kptr + 16 * 5); + __m512 _w6 = _mm512_loadu_ps(kptr + 16 * 6); + __m512 _w7 = _mm512_loadu_ps(kptr + 16 * 7); +#endif + + _sum4 = _mm512_fmadd_ps(_val4, _w4, _sum4); + _sum5 = _mm512_fmadd_ps(_val5, _w5, _sum5); + _sum6 = _mm512_fmadd_ps(_val6, _w6, _sum6); + _sum7 = _mm512_fmadd_ps(_val7, _w7, _sum7); + + sptr += 8; + kptr += 128; + } + for (; i + 3 < num_input; i += 4) + { + __m512 _val0 = _mm512_set1_ps(sptr[0]); + __m512 _val1 = _mm512_set1_ps(sptr[1]); + __m512 _val2 = _mm512_set1_ps(sptr[2]); + __m512 _val3 = _mm512_set1_ps(sptr[3]); +#if NCNN_IMPL_FP16S + __m512i _w01 = _mm512_loadu_si512(kptr); + __m512i _w23 = _mm512_loadu_si512(kptr + 32); + __m512 _w0 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 0)); + __m512 _w1 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 1)); + __m512 _w2 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 0)); + __m512 _w3 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 1)); +#else + __m512 _w0 = _mm512_loadu_ps(kptr); + __m512 _w1 = _mm512_loadu_ps(kptr + 16); + __m512 _w2 = _mm512_loadu_ps(kptr + 32); + __m512 _w3 = _mm512_loadu_ps(kptr + 48); +#endif + + _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); + + sptr += 4; + kptr += 64; + } + for (; i < num_input; i++) + { + __m512 _val = _mm512_set1_ps(sptr[0]); +#if NCNN_IMPL_FP16S + __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif + _sum0 = _mm512_fmadd_ps(_val, _w, _sum0); + + sptr += 1; + kptr += 16; + } + + _sum0 = _mm512_add_ps(_sum0, _sum1); + _sum2 = _mm512_add_ps(_sum2, _sum3); + _sum4 = _mm512_add_ps(_sum4, _sum5); + _sum6 = _mm512_add_ps(_sum6, _sum7); + _sum0 = _mm512_add_ps(_sum0, _sum2); + _sum4 = _mm512_add_ps(_sum4, _sum6); + _sum0 = _mm512_add_ps(_sum0, _sum4); + + _sum0 = activation_avx512(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + _mm512_storeu_ps(outptr + p * 16, _sum0); + } + } +#endif // __AVX512F__ + + if (out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outw; p++) + { + __m256 _sum0 = _mm256_setzero_ps(); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); + __m256 _sum4 = _mm256_setzero_ps(); + __m256 _sum5 = _mm256_setzero_ps(); + __m256 _sum6 = _mm256_setzero_ps(); + __m256 _sum7 = _mm256_setzero_ps(); + + if (bias_data_ptr) + { + _sum0 = _mm256_loadu_ps(bias_data_ptr + p * 8); + } + +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif + const float* sptr = bottom_blob; + + int i = 0; + for (; i + 7 < num_input; i += 8) + { + __m256 _val0 = _mm256_broadcast_ss(sptr); + __m256 _val1 = _mm256_broadcast_ss(sptr + 1); + __m256 _val2 = _mm256_broadcast_ss(sptr + 2); + __m256 _val3 = _mm256_broadcast_ss(sptr + 3); +#if NCNN_IMPL_FP16S + __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); + __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); + __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); + __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); + __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); + __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 
1)); +#else + __m256 _w0 = _mm256_loadu_ps(kptr); + __m256 _w1 = _mm256_loadu_ps(kptr + 8); + __m256 _w2 = _mm256_loadu_ps(kptr + 16); + __m256 _w3 = _mm256_loadu_ps(kptr + 24); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); + + __m256 _val4 = _mm256_broadcast_ss(sptr + 4); + __m256 _val5 = _mm256_broadcast_ss(sptr + 5); + __m256 _val6 = _mm256_broadcast_ss(sptr + 6); + __m256 _val7 = _mm256_broadcast_ss(sptr + 7); +#if NCNN_IMPL_FP16S + __m256i _w45 = _mm256_lddqu_si256((const __m256i*)(kptr + 32)); + __m256i _w67 = _mm256_lddqu_si256((const __m256i*)(kptr + 48)); + __m256 _w4 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w45, 0)); + __m256 _w5 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w45, 1)); + __m256 _w6 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w67, 0)); + __m256 _w7 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w67, 1)); +#else + __m256 _w4 = _mm256_loadu_ps(kptr + 32); + __m256 _w5 = _mm256_loadu_ps(kptr + 40); + __m256 _w6 = _mm256_loadu_ps(kptr + 48); + __m256 _w7 = _mm256_loadu_ps(kptr + 56); +#endif + + _sum4 = _mm256_comp_fmadd_ps(_val4, _w4, _sum4); + _sum5 = _mm256_comp_fmadd_ps(_val5, _w5, _sum5); + _sum6 = _mm256_comp_fmadd_ps(_val6, _w6, _sum6); + _sum7 = _mm256_comp_fmadd_ps(_val7, _w7, _sum7); + + sptr += 8; + kptr += 64; + } + for (; i + 3 < num_input; i += 4) + { + __m256 _val0 = _mm256_broadcast_ss(sptr); + __m256 _val1 = _mm256_broadcast_ss(sptr + 1); + __m256 _val2 = _mm256_broadcast_ss(sptr + 2); + __m256 _val3 = _mm256_broadcast_ss(sptr + 3); +#if NCNN_IMPL_FP16S + __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); + __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); + __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); + __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); + __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); + __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 1)); +#else + __m256 _w0 = _mm256_loadu_ps(kptr); + __m256 _w1 = _mm256_loadu_ps(kptr + 8); + __m256 _w2 = _mm256_loadu_ps(kptr + 16); + __m256 _w3 = _mm256_loadu_ps(kptr + 24); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); + + sptr += 4; + kptr += 32; + } + for (; i < num_input; i++) + { + __m256 _val = _mm256_set1_ps(sptr[0]); +#if NCNN_IMPL_FP16S + __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif + _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); + + sptr += 1; + kptr += 8; + } + + _sum0 = _mm256_add_ps(_sum0, _sum1); + _sum2 = _mm256_add_ps(_sum2, _sum3); + _sum4 = _mm256_add_ps(_sum4, _sum5); + _sum6 = _mm256_add_ps(_sum6, _sum7); + _sum0 = _mm256_add_ps(_sum0, _sum2); + _sum4 = _mm256_add_ps(_sum4, _sum6); + _sum0 = _mm256_add_ps(_sum0, _sum4); + + _sum0 = activation_avx(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + _mm256_storeu_ps(outptr + p * 8, _sum0); + } + } +#endif // __AVX__ + + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outw; p++) + { + __m128 _sum0 = _mm_setzero_ps(); +#if __AVX__ + __m256 _sum01 = _mm256_setzero_ps(); + __m256 _sum23 = _mm256_setzero_ps(); + __m256 _sum45 = _mm256_setzero_ps(); + __m256 _sum67 = 
_mm256_setzero_ps(); +#else + __m128 _sum1 = _mm_setzero_ps(); + __m128 _sum2 = _mm_setzero_ps(); + __m128 _sum3 = _mm_setzero_ps(); +#endif + + if (bias_data_ptr) + { + _sum0 = _mm_loadu_ps(bias_data_ptr + p * 4); + } + +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif + const float* sptr = bottom_blob; + + int i = 0; +#if __AVX__ + for (; i + 7 < num_input; i += 8) + { + __m128 _val0 = _mm_broadcast_ss(sptr); + __m128 _val1 = _mm_broadcast_ss(sptr + 1); + __m128 _val2 = _mm_broadcast_ss(sptr + 2); + __m128 _val3 = _mm_broadcast_ss(sptr + 3); + __m128 _val4 = _mm_broadcast_ss(sptr + 4); + __m128 _val5 = _mm_broadcast_ss(sptr + 5); + __m128 _val6 = _mm_broadcast_ss(sptr + 6); + __m128 _val7 = _mm_broadcast_ss(sptr + 7); + + __m256 _val01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val0), _val1, 1); + __m256 _val23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val2), _val3, 1); + __m256 _val45 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val4), _val5, 1); + __m256 _val67 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val6), _val7, 1); + +#if NCNN_IMPL_FP16S + __m256i _w0123 = _mm256_lddqu_si256((const __m256i*)kptr); + __m256i _w4567 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); + __m256 _w01 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 0)); + __m256 _w23 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 1)); + __m256 _w45 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w4567, 0)); + __m256 _w67 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w4567, 1)); +#else + __m256 _w01 = _mm256_loadu_ps(kptr); + __m256 _w23 = _mm256_loadu_ps(kptr + 8); + __m256 _w45 = _mm256_loadu_ps(kptr + 16); + __m256 _w67 = _mm256_loadu_ps(kptr + 24); +#endif + + _sum01 = _mm256_comp_fmadd_ps(_val01, _w01, _sum01); + _sum23 = _mm256_comp_fmadd_ps(_val23, _w23, _sum23); + _sum45 = _mm256_comp_fmadd_ps(_val45, _w45, _sum45); + _sum67 = _mm256_comp_fmadd_ps(_val67, _w67, _sum67); + + sptr += 8; + kptr += 32; + } +#endif + for (; i + 3 < num_input; i += 4) + { +#if __AVX__ + __m128 _val0 = _mm_broadcast_ss(sptr); + __m128 _val1 = _mm_broadcast_ss(sptr + 1); + __m128 _val2 = _mm_broadcast_ss(sptr + 2); + __m128 _val3 = _mm_broadcast_ss(sptr + 3); + + __m256 _val01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val0), _val1, 1); + __m256 _val23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val2), _val3, 1); + +#if NCNN_IMPL_FP16S + __m256i _w0123 = _mm256_lddqu_si256((const __m256i*)kptr); + __m256 _w01 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 0)); + __m256 _w23 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 1)); +#else + __m256 _w01 = _mm256_loadu_ps(kptr); + __m256 _w23 = _mm256_loadu_ps(kptr + 8); +#endif + + _sum01 = _mm256_comp_fmadd_ps(_val01, _w01, _sum01); + _sum23 = _mm256_comp_fmadd_ps(_val23, _w23, _sum23); +#else + __m128 _val0 = _mm_set1_ps(sptr[0]); + __m128 _val1 = _mm_set1_ps(sptr[1]); + __m128 _val2 = _mm_set1_ps(sptr[2]); + __m128 _val3 = _mm_set1_ps(sptr[3]); + + __m128 _w0 = _mm_loadu_ps(kptr); + __m128 _w1 = _mm_loadu_ps(kptr + 4); + __m128 _w2 = _mm_loadu_ps(kptr + 8); + __m128 _w3 = _mm_loadu_ps(kptr + 12); + + _sum0 = _mm_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm_comp_fmadd_ps(_val3, _w3, _sum3); +#endif + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = _mm_set1_ps(sptr[0]); +#if NCNN_IMPL_FP16S + __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const 
__m128i*)kptr)); +#else + __m128 _w = _mm_loadu_ps(kptr); +#endif + _sum0 = _mm_comp_fmadd_ps(_val, _w, _sum0); + + sptr += 1; + kptr += 4; + } + +#if __AVX__ + _sum01 = _mm256_add_ps(_sum01, _sum23); + _sum45 = _mm256_add_ps(_sum45, _sum67); + _sum01 = _mm256_add_ps(_sum01, _sum45); + + _sum0 = _mm_add_ps(_sum0, _mm256_extractf128_ps(_sum01, 0)); + _sum0 = _mm_add_ps(_sum0, _mm256_extractf128_ps(_sum01, 1)); +#else + _sum0 = _mm_add_ps(_sum0, _sum1); + _sum2 = _mm_add_ps(_sum2, _sum3); + _sum0 = _mm_add_ps(_sum0, _sum2); +#endif + + _sum0 = activation_sse(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + _mm_storeu_ps(outptr + p * 4, _sum0); + } + } +#endif // __SSE2__ + + if (out_elempack == 1) + { +#if __SSE2__ +#if __AVX__ + int remain_outw_start = 0; + int nn_outw = outw >> 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outw; pp++) + { + int p = pp * 8; + + float sums[8] = {0.0f}; + if (bias_data_ptr) + { + sums[0] = bias_data_ptr[p]; + sums[1] = bias_data_ptr[p + 1]; + sums[2] = bias_data_ptr[p + 2]; + sums[3] = bias_data_ptr[p + 3]; + sums[4] = bias_data_ptr[p + 4]; + sums[5] = bias_data_ptr[p + 5]; + sums[6] = bias_data_ptr[p + 6]; + sums[7] = bias_data_ptr[p + 7]; + } + +#if NCNN_IMPL_FP16S + const unsigned short* w0 = weight_data_tm.row(p); + const unsigned short* w1 = weight_data_tm.row(p + 1); + const unsigned short* w2 = weight_data_tm.row(p + 2); + const unsigned short* w3 = weight_data_tm.row(p + 3); + const unsigned short* w4 = weight_data_tm.row(p + 4); + const unsigned short* w5 = weight_data_tm.row(p + 5); + const unsigned short* w6 = weight_data_tm.row(p + 6); + const unsigned short* w7 = weight_data_tm.row(p + 7); +#else + const float* w0 = (const float*)weight_data_tm + num_input * p; + const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); + const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); + const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); + const float* w4 = (const float*)weight_data_tm + num_input * (p + 4); + const float* w5 = (const float*)weight_data_tm + num_input * (p + 5); + const float* w6 = (const float*)weight_data_tm + num_input * (p + 6); + const float* w7 = (const float*)weight_data_tm + num_input * (p + 7); +#endif + const float* m = bottom_blob; + + __m256 _sum0 = _mm256_setzero_ps(); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); + __m256 _sum4 = _mm256_setzero_ps(); + __m256 _sum5 = _mm256_setzero_ps(); + __m256 _sum6 = _mm256_setzero_ps(); + __m256 _sum7 = _mm256_setzero_ps(); + + int i = 0; + for (; i + 7 < num_input; i += 8) + { + __m256 _m = _mm256_loadu_ps(m); + +#if NCNN_IMPL_FP16S + __m256 _w0 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w0)); + __m256 _w1 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w1)); + __m256 _w2 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w2)); + __m256 _w3 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w3)); +#else + __m256 _w0 = _mm256_loadu_ps(w0); + __m256 _w1 = _mm256_loadu_ps(w1); + __m256 _w2 = _mm256_loadu_ps(w2); + __m256 _w3 = _mm256_loadu_ps(w3); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); + +#if NCNN_IMPL_FP16S + __m256 _w4 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w4)); + __m256 _w5 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w5)); 
+ __m256 _w6 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w6)); + __m256 _w7 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w7)); +#else + __m256 _w4 = _mm256_loadu_ps(w4); + __m256 _w5 = _mm256_loadu_ps(w5); + __m256 _w6 = _mm256_loadu_ps(w6); + __m256 _w7 = _mm256_loadu_ps(w7); +#endif + + _sum4 = _mm256_comp_fmadd_ps(_m, _w4, _sum4); + _sum5 = _mm256_comp_fmadd_ps(_m, _w5, _sum5); + _sum6 = _mm256_comp_fmadd_ps(_m, _w6, _sum6); + _sum7 = _mm256_comp_fmadd_ps(_m, _w7, _sum7); + + m += 8; + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + } + for (; i < num_input; i++) + { +#if NCNN_IMPL_FP16S + sums[0] += *m * float16_to_float32(*w0); + sums[1] += *m * float16_to_float32(*w1); + sums[2] += *m * float16_to_float32(*w2); + sums[3] += *m * float16_to_float32(*w3); + sums[4] += *m * float16_to_float32(*w4); + sums[5] += *m * float16_to_float32(*w5); + sums[6] += *m * float16_to_float32(*w6); + sums[7] += *m * float16_to_float32(*w7); +#else + sums[0] += *m * *w0; + sums[1] += *m * *w1; + sums[2] += *m * *w2; + sums[3] += *m * *w3; + sums[4] += *m * *w4; + sums[5] += *m * *w5; + sums[6] += *m * *w6; + sums[7] += *m * *w7; +#endif + + m++; + w0++; + w1++; + w2++; + w3++; + w4++; + w5++; + w6++; + w7++; + } + + __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); + __m256 _sums_f = _mm256_loadu_ps(sums); + _sums = _mm256_add_ps(_sums_f, _sums); + _sums = activation_avx(_sums, activation_type, activation_params); + + float* outptr = top_blob; + _mm256_storeu_ps(outptr + p, _sums); + } + + remain_outw_start += (nn_outw << 3); + nn_outw = (outw - remain_outw_start) >> 2; +#else + int remain_outw_start = 0; + int nn_outw = outw >> 2; +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outw; pp++) + { + int p = remain_outw_start + (pp * 4); + + float sums[4] = {0.0f}; + if (bias_data_ptr) + { + sums[0] = bias_data_ptr[p]; + sums[1] = bias_data_ptr[p + 1]; + sums[2] = bias_data_ptr[p + 2]; + sums[3] = bias_data_ptr[p + 3]; + } + +#if NCNN_IMPL_FP16S + const unsigned short* w0 = weight_data_tm.row(p); + const unsigned short* w1 = weight_data_tm.row(p + 1); + const unsigned short* w2 = weight_data_tm.row(p + 2); + const unsigned short* w3 = weight_data_tm.row(p + 3); +#else + const float* w0 = (const float*)weight_data_tm + num_input * p; + const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); + const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); + const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); +#endif + const float* m = bottom_blob; + + int i = 0; +#if __AVX__ + __m256 _sum0 = _mm256_setzero_ps(); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); + for (; i + 7 < num_input; i += 8) + { + __m256 _m = _mm256_loadu_ps(m); + +#if NCNN_IMPL_FP16S + __m256 _w0 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w0)); + __m256 _w1 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w1)); + __m256 _w2 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w2)); + __m256 _w3 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w3)); +#else + __m256 _w0 = _mm256_loadu_ps(w0); + __m256 _w1 = _mm256_loadu_ps(w1); + __m256 _w2 = _mm256_loadu_ps(w2); + __m256 _w3 = _mm256_loadu_ps(w3); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_m, _w3, 
_sum3); + + m += 8; + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + } +#endif // __AVX__ + + __m128 _sum0l = _mm_setzero_ps(); + __m128 _sum1l = _mm_setzero_ps(); + __m128 _sum2l = _mm_setzero_ps(); + __m128 _sum3l = _mm_setzero_ps(); + for (; i + 3 < num_input; i += 4) + { + __m128 _m = _mm_loadu_ps(m); + +#if NCNN_IMPL_FP16S + __m128 _w0 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w0)); + __m128 _w1 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w1)); + __m128 _w2 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w2)); + __m128 _w3 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w3)); +#else + __m128 _w0 = _mm_loadu_ps(w0); + __m128 _w1 = _mm_loadu_ps(w1); + __m128 _w2 = _mm_loadu_ps(w2); + __m128 _w3 = _mm_loadu_ps(w3); +#endif + + _sum0l = _mm_comp_fmadd_ps(_m, _w0, _sum0l); + _sum1l = _mm_comp_fmadd_ps(_m, _w1, _sum1l); + _sum2l = _mm_comp_fmadd_ps(_m, _w2, _sum2l); + _sum3l = _mm_comp_fmadd_ps(_m, _w3, _sum3l); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + w3 += 4; + } + for (; i < num_input; i++) + { +#if NCNN_IMPL_FP16S + sums[0] += *m * float16_to_float32(*w0); + sums[1] += *m * float16_to_float32(*w1); + sums[2] += *m * float16_to_float32(*w2); + sums[3] += *m * float16_to_float32(*w3); +#else + sums[0] += *m * *w0; + sums[1] += *m * *w1; + sums[2] += *m * *w2; + sums[3] += *m * *w3; +#endif + + m++; + w0++; + w1++; + w2++; + w3++; + } + + __m128 _sums = _mm_loadu_ps(sums); +#if __AVX__ + _sums = _mm_add_ps(HorizontalSums(_sum0, _sum1, _sum2, _sum3), _sums); +#endif + _MM_TRANSPOSE4_PS(_sum0l, _sum1l, _sum2l, _sum3l); + _sums = _mm_add_ps(_sum0l, _sums); + _sums = _mm_add_ps(_sum1l, _sums); + _sums = _mm_add_ps(_sum2l, _sums); + _sums = _mm_add_ps(_sum3l, _sums); + _sums = activation_sse(_sums, activation_type, activation_params); + + float* outptr = top_blob; + _mm_storeu_ps(outptr + p, _sums); + } + + remain_outw_start += (nn_outw << 2); +#else + int remain_outw_start = 0; +#endif // __SSE2__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outw_start; p < outw; p++) + { + float sum = 0.f; + + if (bias_data_ptr) + sum = bias_data_ptr[p]; + +#if NCNN_IMPL_FP16S + const unsigned short* w = weight_data_tm.row(p); +#else + const float* w = (const float*)weight_data_tm + num_input * p; +#endif + const float* m = bottom_blob; + + int i = 0; +#if __SSE2__ +#if __AVX__ + __m256 _sum = _mm256_setzero_ps(); + for (; i + 7 < num_input; i += 8) + { + __m256 _m = _mm256_loadu_ps(m); +#if NCNN_IMPL_FP16S + __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w)); +#else + __m256 _w = _mm256_loadu_ps(w); +#endif + _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); + + m += 8; + w += 8; + } +#endif // __AVX__ + __m128 _suml = _mm_setzero_ps(); + for (; i + 3 < num_input; i += 4) + { + __m128 _m = _mm_loadu_ps(m); +#if NCNN_IMPL_FP16S + __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w)); +#else + __m128 _w = _mm_loadu_ps(w); +#endif + _suml = _mm_comp_fmadd_ps(_m, _w, _suml); + + m += 4; + w += 4; + } +#endif // __SSE2__ + for (; i < num_input; i++) + { +#if NCNN_IMPL_FP16S + sum += *m * float16_to_float32(*w); +#else + sum += *m * *w; +#endif + m++; + w++; + } + +#if __SSE2__ +#if __AVX__ + _suml = _mm_add_ps(_suml, _mm256_extractf128_ps(_sum, 1)); + _suml = _mm_add_ps(_suml, _mm256_castps256_ps128(_sum)); +#endif // __AVX__ + sum += _mm_reduce_add_ps(_suml); +#endif // __SSE2__ + + sum = activation_ss(sum, activation_type, activation_params); + + float* outptr = top_blob; + outptr[p] = sum; + } + } +#endif // NCNN_RUNTIME_CPU +} + +#if NCNN_IMPL_FP16S 
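+// fp16s flavor: the repacked weights below are stored as fp16
+// (float32_to_float16 / _mm512_cvtps_ph), halving weight memory versus fp32.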
+static void innerproduct_transform_kernel_fp16s_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt) +#else +static void innerproduct_transform_kernel_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt) +#endif +{ +#if NCNN_RUNTIME_CPU && NCNN_IMPL_FP16S && NCNN_F16C && __AVX__ && !__F16C__ + if (ncnn::cpu_support_x86_f16c()) + { + innerproduct_transform_kernel_fp16s_sse_f16c(weight_data, weight_data_tm, num_input, num_output, opt); + return; + } +#else // NCNN_RUNTIME_CPU + + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + out_elempack = num_output % 4 == 0 ? 4 : 1; +#endif + } +#endif // __SSE2__ + + // src = inch-outch + // dst = pb-inch-outch/pb +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (out_elempack == 16) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + +#if NCNN_IMPL_FP16S + weight_data_tm.create(num_input, num_output / 16, (size_t)32u, 16); +#else + weight_data_tm.create(num_input, num_output / 16, (size_t)64u, 16); +#endif + + for (int q = 0; q + 15 < num_output; q += 16) + { +#if NCNN_IMPL_FP16S + unsigned short* g0 = weight_data_tm.row(q / 16); +#else + float* g0 = weight_data_tm.row(q / 16); +#endif + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 3); + const float* k4 = weight_data_r2.row(q + 4); + const float* k5 = weight_data_r2.row(q + 5); + const float* k6 = weight_data_r2.row(q + 6); + const float* k7 = weight_data_r2.row(q + 7); + const float* k8 = weight_data_r2.row(q + 8); + const float* k9 = weight_data_r2.row(q + 9); + const float* ka = weight_data_r2.row(q + 10); + const float* kb = weight_data_r2.row(q + 11); + const float* kc = weight_data_r2.row(q + 12); + const float* kd = weight_data_r2.row(q + 13); + const float* ke = weight_data_r2.row(q + 14); + const float* kf = weight_data_r2.row(q + 15); + + int p = 0; + for (; p + 15 < num_input; p += 16) + { + // transpose 16x16 +#if NCNN_IMPL_FP16S + __m256i _r0 = _mm512_cvtps_ph(_mm512_loadu_ps(k0), _MM_FROUND_TRUNC); + __m256i _r1 = _mm512_cvtps_ph(_mm512_loadu_ps(k1), _MM_FROUND_TRUNC); + __m256i _r2 = _mm512_cvtps_ph(_mm512_loadu_ps(k2), _MM_FROUND_TRUNC); + __m256i _r3 = _mm512_cvtps_ph(_mm512_loadu_ps(k3), _MM_FROUND_TRUNC); + __m256i _r4 = _mm512_cvtps_ph(_mm512_loadu_ps(k4), _MM_FROUND_TRUNC); + __m256i _r5 = _mm512_cvtps_ph(_mm512_loadu_ps(k5), _MM_FROUND_TRUNC); + __m256i _r6 = _mm512_cvtps_ph(_mm512_loadu_ps(k6), _MM_FROUND_TRUNC); + __m256i _r7 = _mm512_cvtps_ph(_mm512_loadu_ps(k7), _MM_FROUND_TRUNC); + __m256i _r8 = _mm512_cvtps_ph(_mm512_loadu_ps(k8), _MM_FROUND_TRUNC); + __m256i _r9 = _mm512_cvtps_ph(_mm512_loadu_ps(k9), _MM_FROUND_TRUNC); + __m256i _ra = _mm512_cvtps_ph(_mm512_loadu_ps(ka), _MM_FROUND_TRUNC); + __m256i _rb = _mm512_cvtps_ph(_mm512_loadu_ps(kb), _MM_FROUND_TRUNC); + __m256i _rc = _mm512_cvtps_ph(_mm512_loadu_ps(kc), _MM_FROUND_TRUNC); + __m256i _rd = _mm512_cvtps_ph(_mm512_loadu_ps(kd), _MM_FROUND_TRUNC); + __m256i _re = _mm512_cvtps_ph(_mm512_loadu_ps(ke), _MM_FROUND_TRUNC); + __m256i _rf = _mm512_cvtps_ph(_mm512_loadu_ps(kf), _MM_FROUND_TRUNC); + + transpose16x16_epi16(_r0, _r1, _r2, _r3, _r4, _r5, 
_r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + + _mm256_storeu_si256((__m256i*)g0, _r0); + _mm256_storeu_si256((__m256i*)(g0 + 16), _r1); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 8), _r8); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 9), _r9); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 10), _ra); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 11), _rb); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 12), _rc); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 13), _rd); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 14), _re); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 15), _rf); +#else + __m512 _r0 = _mm512_loadu_ps(k0); + __m512 _r1 = _mm512_loadu_ps(k1); + __m512 _r2 = _mm512_loadu_ps(k2); + __m512 _r3 = _mm512_loadu_ps(k3); + __m512 _r4 = _mm512_loadu_ps(k4); + __m512 _r5 = _mm512_loadu_ps(k5); + __m512 _r6 = _mm512_loadu_ps(k6); + __m512 _r7 = _mm512_loadu_ps(k7); + __m512 _r8 = _mm512_loadu_ps(k8); + __m512 _r9 = _mm512_loadu_ps(k9); + __m512 _ra = _mm512_loadu_ps(ka); + __m512 _rb = _mm512_loadu_ps(kb); + __m512 _rc = _mm512_loadu_ps(kc); + __m512 _rd = _mm512_loadu_ps(kd); + __m512 _re = _mm512_loadu_ps(ke); + __m512 _rf = _mm512_loadu_ps(kf); + + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + + _mm512_storeu_ps(g0, _r0); + _mm512_storeu_ps(g0 + 16, _r1); + _mm512_storeu_ps(g0 + 16 * 2, _r2); + _mm512_storeu_ps(g0 + 16 * 3, _r3); + _mm512_storeu_ps(g0 + 16 * 4, _r4); + _mm512_storeu_ps(g0 + 16 * 5, _r5); + _mm512_storeu_ps(g0 + 16 * 6, _r6); + _mm512_storeu_ps(g0 + 16 * 7, _r7); + _mm512_storeu_ps(g0 + 16 * 8, _r8); + _mm512_storeu_ps(g0 + 16 * 9, _r9); + _mm512_storeu_ps(g0 + 16 * 10, _ra); + _mm512_storeu_ps(g0 + 16 * 11, _rb); + _mm512_storeu_ps(g0 + 16 * 12, _rc); + _mm512_storeu_ps(g0 + 16 * 13, _rd); + _mm512_storeu_ps(g0 + 16 * 14, _re); + _mm512_storeu_ps(g0 + 16 * 15, _rf); +#endif + + k0 += 16; + k1 += 16; + k2 += 16; + k3 += 16; + k4 += 16; + k5 += 16; + k6 += 16; + k7 += 16; + k8 += 16; + k9 += 16; + ka += 16; + kb += 16; + kc += 16; + kd += 16; + ke += 16; + kf += 16; + g0 += 256; + } + for (; p + 7 < num_input; p += 8) + { + // transpose 8x16 +#if NCNN_IMPL_FP16S + __m128i _r0 = _mm256_cvtps_ph(_mm256_loadu_ps(k0), _MM_FROUND_TRUNC); + __m128i _r1 = _mm256_cvtps_ph(_mm256_loadu_ps(k1), _MM_FROUND_TRUNC); + __m128i _r2 = _mm256_cvtps_ph(_mm256_loadu_ps(k2), _MM_FROUND_TRUNC); + __m128i _r3 = _mm256_cvtps_ph(_mm256_loadu_ps(k3), _MM_FROUND_TRUNC); + __m128i _r4 = _mm256_cvtps_ph(_mm256_loadu_ps(k4), _MM_FROUND_TRUNC); + __m128i _r5 = _mm256_cvtps_ph(_mm256_loadu_ps(k5), _MM_FROUND_TRUNC); + __m128i _r6 = _mm256_cvtps_ph(_mm256_loadu_ps(k6), _MM_FROUND_TRUNC); + __m128i _r7 = _mm256_cvtps_ph(_mm256_loadu_ps(k7), _MM_FROUND_TRUNC); + __m128i _r8 = _mm256_cvtps_ph(_mm256_loadu_ps(k8), _MM_FROUND_TRUNC); + __m128i _r9 = _mm256_cvtps_ph(_mm256_loadu_ps(k9), _MM_FROUND_TRUNC); + __m128i _ra = _mm256_cvtps_ph(_mm256_loadu_ps(ka), _MM_FROUND_TRUNC); + __m128i _rb = _mm256_cvtps_ph(_mm256_loadu_ps(kb), _MM_FROUND_TRUNC); + __m128i _rc = _mm256_cvtps_ph(_mm256_loadu_ps(kc), _MM_FROUND_TRUNC); + __m128i _rd = _mm256_cvtps_ph(_mm256_loadu_ps(kd), _MM_FROUND_TRUNC); + __m128i _re = _mm256_cvtps_ph(_mm256_loadu_ps(ke), 
_MM_FROUND_TRUNC); + __m128i _rf = _mm256_cvtps_ph(_mm256_loadu_ps(kf), _MM_FROUND_TRUNC); + + __m256i _r08 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _r8, 1); + __m256i _r19 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r1), _r9, 1); + __m256i _r2a = _mm256_inserti128_si256(_mm256_castsi128_si256(_r2), _ra, 1); + __m256i _r3b = _mm256_inserti128_si256(_mm256_castsi128_si256(_r3), _rb, 1); + __m256i _r4c = _mm256_inserti128_si256(_mm256_castsi128_si256(_r4), _rc, 1); + __m256i _r5d = _mm256_inserti128_si256(_mm256_castsi128_si256(_r5), _rd, 1); + __m256i _r6e = _mm256_inserti128_si256(_mm256_castsi128_si256(_r6), _re, 1); + __m256i _r7f = _mm256_inserti128_si256(_mm256_castsi128_si256(_r7), _rf, 1); + + __m256i _tmp0 = _mm256_unpacklo_epi16(_r08, _r19); + __m256i _tmp1 = _mm256_unpackhi_epi16(_r08, _r19); + __m256i _tmp2 = _mm256_unpacklo_epi16(_r2a, _r3b); + __m256i _tmp3 = _mm256_unpackhi_epi16(_r2a, _r3b); + __m256i _tmp4 = _mm256_unpacklo_epi16(_r4c, _r5d); + __m256i _tmp5 = _mm256_unpackhi_epi16(_r4c, _r5d); + __m256i _tmp6 = _mm256_unpacklo_epi16(_r6e, _r7f); + __m256i _tmp7 = _mm256_unpackhi_epi16(_r6e, _r7f); + + __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); + __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); + __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); + __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); + __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); + __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); + __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); + __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); + + _r08 = _mm256_unpacklo_epi64(_tmpg, _tmpk); + _r19 = _mm256_unpackhi_epi64(_tmpg, _tmpk); + _r2a = _mm256_unpacklo_epi64(_tmph, _tmpl); + _r3b = _mm256_unpackhi_epi64(_tmph, _tmpl); + _r4c = _mm256_unpacklo_epi64(_tmpi, _tmpm); + _r5d = _mm256_unpackhi_epi64(_tmpi, _tmpm); + _r6e = _mm256_unpacklo_epi64(_tmpj, _tmpn); + _r7f = _mm256_unpackhi_epi64(_tmpj, _tmpn); + + _mm256_storeu_si256((__m256i*)g0, _r08); + _mm256_storeu_si256((__m256i*)(g0 + 16), _r19); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2a); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3b); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4c); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5d); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6e); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7f); +#else + __m256 _r0 = _mm256_loadu_ps(k0); + __m256 _r1 = _mm256_loadu_ps(k1); + __m256 _r2 = _mm256_loadu_ps(k2); + __m256 _r3 = _mm256_loadu_ps(k3); + __m256 _r4 = _mm256_loadu_ps(k4); + __m256 _r5 = _mm256_loadu_ps(k5); + __m256 _r6 = _mm256_loadu_ps(k6); + __m256 _r7 = _mm256_loadu_ps(k7); + __m256 _r8 = _mm256_loadu_ps(k8); + __m256 _r9 = _mm256_loadu_ps(k9); + __m256 _ra = _mm256_loadu_ps(ka); + __m256 _rb = _mm256_loadu_ps(kb); + __m256 _rc = _mm256_loadu_ps(kc); + __m256 _rd = _mm256_loadu_ps(kd); + __m256 _re = _mm256_loadu_ps(ke); + __m256 _rf = _mm256_loadu_ps(kf); + + transpose8x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + + _mm256_storeu_ps(g0, _r0); + _mm256_storeu_ps(g0 + 8, _r1); + _mm256_storeu_ps(g0 + 8 * 2, _r2); + _mm256_storeu_ps(g0 + 8 * 3, _r3); + _mm256_storeu_ps(g0 + 8 * 4, _r4); + _mm256_storeu_ps(g0 + 8 * 5, _r5); + _mm256_storeu_ps(g0 + 8 * 6, _r6); + _mm256_storeu_ps(g0 + 8 * 7, _r7); + _mm256_storeu_ps(g0 + 8 * 8, _r8); + _mm256_storeu_ps(g0 + 8 * 9, _r9); + _mm256_storeu_ps(g0 + 8 * 10, _ra); + _mm256_storeu_ps(g0 + 8 * 11, _rb); + _mm256_storeu_ps(g0 + 8 * 12, _rc); + 
_mm256_storeu_ps(g0 + 8 * 13, _rd); + _mm256_storeu_ps(g0 + 8 * 14, _re); + _mm256_storeu_ps(g0 + 8 * 15, _rf); +#endif + + k0 += 8; + k1 += 8; + k2 += 8; + k3 += 8; + k4 += 8; + k5 += 8; + k6 += 8; + k7 += 8; + k8 += 8; + k9 += 8; + ka += 8; + kb += 8; + kc += 8; + kd += 8; + ke += 8; + kf += 8; + g0 += 128; + } + for (; p < num_input; p++) + { +#if NCNN_IMPL_FP16S + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); + g0[4] = float32_to_float16(*k4++); + g0[5] = float32_to_float16(*k5++); + g0[6] = float32_to_float16(*k6++); + g0[7] = float32_to_float16(*k7++); + g0[8] = float32_to_float16(*k8++); + g0[9] = float32_to_float16(*k9++); + g0[10] = float32_to_float16(*ka++); + g0[11] = float32_to_float16(*kb++); + g0[12] = float32_to_float16(*kc++); + g0[13] = float32_to_float16(*kd++); + g0[14] = float32_to_float16(*ke++); + g0[15] = float32_to_float16(*kf++); +#else + g0[0] = *k0++; + g0[1] = *k1++; + g0[2] = *k2++; + g0[3] = *k3++; + g0[4] = *k4++; + g0[5] = *k5++; + g0[6] = *k6++; + g0[7] = *k7++; + g0[8] = *k8++; + g0[9] = *k9++; + g0[10] = *ka++; + g0[11] = *kb++; + g0[12] = *kc++; + g0[13] = *kd++; + g0[14] = *ke++; + g0[15] = *kf++; +#endif + g0 += 16; + } + } + } +#endif // __AVX512F__ + + if (out_elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + +#if NCNN_IMPL_FP16S + weight_data_tm.create(num_input, num_output / 8, (size_t)16u, 8); +#else + weight_data_tm.create(num_input, num_output / 8, (size_t)32u, 8); +#endif + + for (int q = 0; q + 7 < num_output; q += 8) + { +#if NCNN_IMPL_FP16S + unsigned short* g0 = weight_data_tm.row(q / 8); +#else + float* g0 = weight_data_tm.row(q / 8); +#endif + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 3); + const float* k4 = weight_data_r2.row(q + 4); + const float* k5 = weight_data_r2.row(q + 5); + const float* k6 = weight_data_r2.row(q + 6); + const float* k7 = weight_data_r2.row(q + 7); + + int p = 0; +#if __AVX512F__ + for (; p + 15 < num_input; p += 16) + { + // transpose 16x8 +#if NCNN_IMPL_FP16S + __m256i _r0 = _mm512_cvtps_ph(_mm512_loadu_ps(k0), _MM_FROUND_TRUNC); + __m256i _r1 = _mm512_cvtps_ph(_mm512_loadu_ps(k1), _MM_FROUND_TRUNC); + __m256i _r2 = _mm512_cvtps_ph(_mm512_loadu_ps(k2), _MM_FROUND_TRUNC); + __m256i _r3 = _mm512_cvtps_ph(_mm512_loadu_ps(k3), _MM_FROUND_TRUNC); + __m256i _r4 = _mm512_cvtps_ph(_mm512_loadu_ps(k4), _MM_FROUND_TRUNC); + __m256i _r5 = _mm512_cvtps_ph(_mm512_loadu_ps(k5), _MM_FROUND_TRUNC); + __m256i _r6 = _mm512_cvtps_ph(_mm512_loadu_ps(k6), _MM_FROUND_TRUNC); + __m256i _r7 = _mm512_cvtps_ph(_mm512_loadu_ps(k7), _MM_FROUND_TRUNC); + + transpose16x8_epi16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + + _mm256_storeu_si256((__m256i*)g0, _r0); + _mm256_storeu_si256((__m256i*)(g0 + 16), _r1); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7); +#else + __m512 _r0 = _mm512_loadu_ps(k0); + __m512 _r1 = _mm512_loadu_ps(k1); + __m512 _r2 = _mm512_loadu_ps(k2); + __m512 _r3 = _mm512_loadu_ps(k3); + __m512 _r4 = _mm512_loadu_ps(k4); + __m512 _r5 = _mm512_loadu_ps(k5); + __m512 _r6 = 
_mm512_loadu_ps(k6); + __m512 _r7 = _mm512_loadu_ps(k7); + + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + + _mm512_storeu_ps(g0, _r0); + _mm512_storeu_ps(g0 + 16, _r1); + _mm512_storeu_ps(g0 + 16 * 2, _r2); + _mm512_storeu_ps(g0 + 16 * 3, _r3); + _mm512_storeu_ps(g0 + 16 * 4, _r4); + _mm512_storeu_ps(g0 + 16 * 5, _r5); + _mm512_storeu_ps(g0 + 16 * 6, _r6); + _mm512_storeu_ps(g0 + 16 * 7, _r7); +#endif + + k0 += 16; + k1 += 16; + k2 += 16; + k3 += 16; + k4 += 16; + k5 += 16; + k6 += 16; + k7 += 16; + g0 += 128; + } +#endif // __AVX512F__ + for (; p + 7 < num_input; p += 8) + { + // transpose 8x8 +#if NCNN_IMPL_FP16S + __m128i _r0 = _mm256_cvtps_ph(_mm256_loadu_ps(k0), _MM_FROUND_TRUNC); + __m128i _r1 = _mm256_cvtps_ph(_mm256_loadu_ps(k1), _MM_FROUND_TRUNC); + __m128i _r2 = _mm256_cvtps_ph(_mm256_loadu_ps(k2), _MM_FROUND_TRUNC); + __m128i _r3 = _mm256_cvtps_ph(_mm256_loadu_ps(k3), _MM_FROUND_TRUNC); + __m128i _r4 = _mm256_cvtps_ph(_mm256_loadu_ps(k4), _MM_FROUND_TRUNC); + __m128i _r5 = _mm256_cvtps_ph(_mm256_loadu_ps(k5), _MM_FROUND_TRUNC); + __m128i _r6 = _mm256_cvtps_ph(_mm256_loadu_ps(k6), _MM_FROUND_TRUNC); + __m128i _r7 = _mm256_cvtps_ph(_mm256_loadu_ps(k7), _MM_FROUND_TRUNC); + + transpose8x8_epi16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + + _mm_storeu_si128((__m128i*)g0, _r0); + _mm_storeu_si128((__m128i*)(g0 + 8), _r1); + _mm_storeu_si128((__m128i*)(g0 + 16), _r2); + _mm_storeu_si128((__m128i*)(g0 + 24), _r3); + _mm_storeu_si128((__m128i*)(g0 + 32), _r4); + _mm_storeu_si128((__m128i*)(g0 + 40), _r5); + _mm_storeu_si128((__m128i*)(g0 + 48), _r6); + _mm_storeu_si128((__m128i*)(g0 + 56), _r7); +#else + __m256 _r0 = _mm256_loadu_ps(k0); + __m256 _r1 = _mm256_loadu_ps(k1); + __m256 _r2 = _mm256_loadu_ps(k2); + __m256 _r3 = _mm256_loadu_ps(k3); + __m256 _r4 = _mm256_loadu_ps(k4); + __m256 _r5 = _mm256_loadu_ps(k5); + __m256 _r6 = _mm256_loadu_ps(k6); + __m256 _r7 = _mm256_loadu_ps(k7); + + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + + _mm256_storeu_ps(g0, _r0); + _mm256_storeu_ps(g0 + 8, _r1); + _mm256_storeu_ps(g0 + 16, _r2); + _mm256_storeu_ps(g0 + 24, _r3); + _mm256_storeu_ps(g0 + 32, _r4); + _mm256_storeu_ps(g0 + 40, _r5); + _mm256_storeu_ps(g0 + 48, _r6); + _mm256_storeu_ps(g0 + 56, _r7); +#endif + + k0 += 8; + k1 += 8; + k2 += 8; + k3 += 8; + k4 += 8; + k5 += 8; + k6 += 8; + k7 += 8; + g0 += 64; + } + for (; p < num_input; p++) + { +#if NCNN_IMPL_FP16S + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); + g0[4] = float32_to_float16(*k4++); + g0[5] = float32_to_float16(*k5++); + g0[6] = float32_to_float16(*k6++); + g0[7] = float32_to_float16(*k7++); +#else + g0[0] = *k0++; + g0[1] = *k1++; + g0[2] = *k2++; + g0[3] = *k3++; + g0[4] = *k4++; + g0[5] = *k5++; + g0[6] = *k6++; + g0[7] = *k7++; +#endif + g0 += 8; + } + } + } +#endif // __AVX__ + + if (out_elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + +#if NCNN_IMPL_FP16S + weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4); +#else + weight_data_tm.create(num_input, num_output / 4, (size_t)16u, 4); +#endif + + for (int q = 0; q + 3 < num_output; q += 4) + { +#if NCNN_IMPL_FP16S + unsigned short* g0 = weight_data_tm.row(q / 4); +#else + float* g0 = weight_data_tm.row(q / 4); +#endif + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 
3); + + int p = 0; + for (; p + 3 < num_input; p += 4) + { + // transpose 4x4 + __m128 _r0 = _mm_loadu_ps(k0); + __m128 _r1 = _mm_loadu_ps(k1); + __m128 _r2 = _mm_loadu_ps(k2); + __m128 _r3 = _mm_loadu_ps(k3); + _MM_TRANSPOSE4_PS(_r0, _r1, _r2, _r3); +#if NCNN_IMPL_FP16S + __m256 _r01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_r0), _r1, 1); + __m256 _r23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_r2), _r3, 1); + __m128i _r01_fp16 = _mm256_cvtps_ph(_r01, _MM_FROUND_TRUNC); + __m128i _r23_fp16 = _mm256_cvtps_ph(_r23, _MM_FROUND_TRUNC); + _mm_storeu_si128((__m128i*)g0, _r01_fp16); + _mm_storeu_si128((__m128i*)(g0 + 8), _r23_fp16); +#else + _mm_storeu_ps(g0, _r0); + _mm_storeu_ps(g0 + 4, _r1); + _mm_storeu_ps(g0 + 8, _r2); + _mm_storeu_ps(g0 + 12, _r3); +#endif + + k0 += 4; + k1 += 4; + k2 += 4; + k3 += 4; + g0 += 16; + } + for (; p < num_input; p++) + { +#if NCNN_IMPL_FP16S + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); +#else + g0[0] = *k0++; + g0[1] = *k1++; + g0[2] = *k2++; + g0[3] = *k3++; +#endif + g0 += 4; + } + } + } +#endif // __SSE2__ + + if (out_elempack == 1) + { +#if NCNN_IMPL_FP16S + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); +#else + weight_data_tm = weight_data; +#endif + } +#endif // NCNN_RUNTIME_CPU +} diff --git a/src/layer/x86/innerproduct_fp16s.h b/src/layer/x86/innerproduct_fp16s.h deleted file mode 100644 index acf22ec2dd33..000000000000 --- a/src/layer/x86/innerproduct_fp16s.h +++ /dev/null @@ -1,1200 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#if __AVX512F__ -static void innerproduct_fp16s_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ - const int num_input = bottom_blob.w * bottom_blob.elempack; - const int num_output = top_blob.w; - - const float* bias_data_ptr = bias_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - __m512 _sum0 = _mm512_setzero_ps(); - __m512 _sum1 = _mm512_setzero_ps(); - __m512 _sum2 = _mm512_setzero_ps(); - __m512 _sum3 = _mm512_setzero_ps(); - __m512 _sum4 = _mm512_setzero_ps(); - __m512 _sum5 = _mm512_setzero_ps(); - __m512 _sum6 = _mm512_setzero_ps(); - __m512 _sum7 = _mm512_setzero_ps(); - - if (bias_data_ptr) - { - _sum0 = _mm512_loadu_ps(bias_data_ptr + p * 16); - } - - const unsigned short* kptr = weight_data_fp16.row(p); - - const float* sptr = bottom_blob; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m512i _w01 = _mm512_loadu_si512(kptr); - __m512 _val0 = _mm512_set1_ps(sptr[0]); - __m512 _w0 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 0)); - _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - - __m512 _val1 = _mm512_set1_ps(sptr[1]); - __m512 _w1 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 1)); - _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); - - __m512i _w23 = _mm512_loadu_si512(kptr + 32); - __m512 _val2 = _mm512_set1_ps(sptr[2]); - __m512 _w2 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 0)); - _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); - - __m512 _val3 = _mm512_set1_ps(sptr[3]); - __m512 _w3 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 1)); - _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); - - __m512i _w45 = _mm512_loadu_si512(kptr + 64); - __m512 _val4 = _mm512_set1_ps(sptr[4]); - __m512 _w4 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w45, 0)); - _sum4 = _mm512_fmadd_ps(_val4, _w4, _sum4); - - __m512 _val5 = _mm512_set1_ps(sptr[5]); - __m512 _w5 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w45, 1)); - _sum5 = _mm512_fmadd_ps(_val5, _w5, _sum5); - - __m512i _w67 = _mm512_loadu_si512(kptr + 96); - __m512 _val6 = _mm512_set1_ps(sptr[6]); - __m512 _w6 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w67, 0)); - _sum6 = _mm512_fmadd_ps(_val6, _w6, _sum6); - - __m512 _val7 = _mm512_set1_ps(sptr[7]); - __m512 _w7 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w67, 1)); - _sum7 = _mm512_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 128; - } - for (; i + 3 < num_input; i += 4) - { - __m512i _w01 = _mm512_loadu_si512(kptr); - __m512 _val0 = _mm512_set1_ps(sptr[0]); - __m512 _w0 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 0)); - _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - - __m512 _val1 = _mm512_set1_ps(sptr[1]); - __m512 _w1 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 1)); - _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); - - __m512i _w23 = _mm512_loadu_si512(kptr + 32); - __m512 _val2 = _mm512_set1_ps(sptr[2]); - __m512 _w2 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 0)); - _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); - - __m512 _val3 = _mm512_set1_ps(sptr[3]); - __m512 _w3 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 1)); - _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); - - sptr += 4; - kptr += 64; - } - for (; i < num_input; i++) - { - __m512 _val = _mm512_set1_ps(sptr[0]); - __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); - _sum0 = _mm512_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 16; - } - - _sum0 = _mm512_add_ps(_sum0, _sum1); - 
_sum2 = _mm512_add_ps(_sum2, _sum3); - _sum4 = _mm512_add_ps(_sum4, _sum5); - _sum6 = _mm512_add_ps(_sum6, _sum7); - _sum0 = _mm512_add_ps(_sum0, _sum2); - _sum4 = _mm512_add_ps(_sum4, _sum6); - _sum0 = _mm512_add_ps(_sum0, _sum4); - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm512_storeu_ps(outptr + p * 16, _sum0); - } -} -#endif // __AVX512F__ - -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ -void innerproduct_fp16s_pack8_avx_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); -void innerproduct_fp16s_pack4_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); -void innerproduct_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); -void innerproduct_transform_kernel_fp16s_sse_f16c(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt); -#endif - -static void innerproduct_fp16s_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ - if (ncnn::cpu_support_x86_f16c()) - { - innerproduct_fp16s_pack8_avx_f16c(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); - return; - } -#endif - -#if __F16C__ - const int num_input = bottom_blob.w * bottom_blob.elempack; - const int num_output = top_blob.w; - - const float* bias_data_ptr = bias_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - __m256 _sum0 = _mm256_setzero_ps(); - __m256 _sum1 = _mm256_setzero_ps(); - __m256 _sum2 = _mm256_setzero_ps(); - __m256 _sum3 = _mm256_setzero_ps(); - __m256 _sum4 = _mm256_setzero_ps(); - __m256 _sum5 = _mm256_setzero_ps(); - __m256 _sum6 = _mm256_setzero_ps(); - __m256 _sum7 = _mm256_setzero_ps(); - - if (bias_data_ptr) - { - _sum0 = _mm256_loadu_ps(bias_data_ptr + p * 8); - } - - const unsigned short* kptr = weight_data_fp16.row(p); - - const float* sptr = bottom_blob; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); - __m256 _val0 = _mm256_broadcast_ss(sptr); - __m128i _w0_fp16 = _mm256_extractf128_si256(_w01, 0); - __m256 _w0 = _mm256_cvtph_ps(_w0_fp16); - _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); - - __m256 _val1 = _mm256_broadcast_ss(sptr + 1); - __m128i _w1_fp16 = _mm256_extractf128_si256(_w01, 1); - __m256 _w1 = _mm256_cvtph_ps(_w1_fp16); - _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); - - __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - __m256 _val2 = _mm256_broadcast_ss(sptr + 2); - __m128i _w2_fp16 = _mm256_extractf128_si256(_w23, 0); - __m256 _w2 = _mm256_cvtph_ps(_w2_fp16); - _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); - - __m256 _val3 = _mm256_broadcast_ss(sptr + 3); - __m128i _w3_fp16 = _mm256_extractf128_si256(_w23, 1); - __m256 _w3 = _mm256_cvtph_ps(_w3_fp16); - _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); - - __m256i _w45 = _mm256_lddqu_si256((const __m256i*)(kptr + 32)); - __m256 _val4 = _mm256_broadcast_ss(sptr + 4); - __m128i _w4_fp16 = 
_mm256_extractf128_si256(_w45, 0); - __m256 _w4 = _mm256_cvtph_ps(_w4_fp16); - _sum4 = _mm256_comp_fmadd_ps(_val4, _w4, _sum4); - - __m256 _val5 = _mm256_broadcast_ss(sptr + 5); - __m128i _w5_fp16 = _mm256_extractf128_si256(_w45, 1); - __m256 _w5 = _mm256_cvtph_ps(_w5_fp16); - _sum5 = _mm256_comp_fmadd_ps(_val5, _w5, _sum5); - - __m256i _w67 = _mm256_lddqu_si256((const __m256i*)(kptr + 48)); - __m256 _val6 = _mm256_broadcast_ss(sptr + 6); - __m128i _w6_fp16 = _mm256_extractf128_si256(_w67, 0); - __m256 _w6 = _mm256_cvtph_ps(_w6_fp16); - _sum6 = _mm256_comp_fmadd_ps(_val6, _w6, _sum6); - - __m256 _val7 = _mm256_broadcast_ss(sptr + 7); - __m128i _w7_fp16 = _mm256_extractf128_si256(_w67, 1); - __m256 _w7 = _mm256_cvtph_ps(_w7_fp16); - _sum7 = _mm256_comp_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 64; - } - for (; i + 3 < num_input; i += 4) - { - __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); - __m256 _val0 = _mm256_broadcast_ss(sptr); - __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); - _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); - - __m256 _val1 = _mm256_broadcast_ss(sptr + 1); - __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); - _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); - - __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - __m256 _val2 = _mm256_broadcast_ss(sptr + 2); - __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); - _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); - - __m256 _val3 = _mm256_broadcast_ss(sptr + 3); - __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 1)); - _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); - - sptr += 4; - kptr += 32; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_set1_ps(sptr[0]); - __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); - _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 8; - } - - _sum0 = _mm256_add_ps(_sum0, _sum1); - _sum2 = _mm256_add_ps(_sum2, _sum3); - _sum4 = _mm256_add_ps(_sum4, _sum5); - _sum6 = _mm256_add_ps(_sum6, _sum7); - _sum0 = _mm256_add_ps(_sum0, _sum2); - _sum4 = _mm256_add_ps(_sum4, _sum6); - _sum0 = _mm256_add_ps(_sum0, _sum4); - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm256_storeu_ps(outptr + p * 8, _sum0); - } -#else // __F16C__ - (void)bottom_blob; - (void)top_blob; - (void)weight_data_fp16; - (void)bias_data; - (void)activation_type; - (void)activation_params; - (void)opt; -#endif // __F16C__ -} - -static void innerproduct_fp16s_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ - if (ncnn::cpu_support_x86_f16c()) - { - innerproduct_fp16s_pack4_sse_f16c(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); - return; - } -#endif - -#if __F16C__ - const int num_input = bottom_blob.w * bottom_blob.elempack; - const int num_output = top_blob.w; - - const float* bias_data_ptr = bias_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - __m128 _sum0 = _mm_setzero_ps(); - - __m256 _sum01 = _mm256_setzero_ps(); - __m256 _sum23 = _mm256_setzero_ps(); - __m256 _sum45 = _mm256_setzero_ps(); - __m256 _sum67 = _mm256_setzero_ps(); - - if (bias_data_ptr) - { - _sum0 = _mm_loadu_ps(bias_data_ptr + p * 4); - } - - const unsigned short* kptr = 
weight_data_fp16.row(p); - - const float* sptr = bottom_blob; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m128 _val0 = _mm_broadcast_ss(sptr); - __m128 _val1 = _mm_broadcast_ss(sptr + 1); - __m128 _val2 = _mm_broadcast_ss(sptr + 2); - __m128 _val3 = _mm_broadcast_ss(sptr + 3); - __m128 _val4 = _mm_broadcast_ss(sptr + 4); - __m128 _val5 = _mm_broadcast_ss(sptr + 5); - __m128 _val6 = _mm_broadcast_ss(sptr + 6); - __m128 _val7 = _mm_broadcast_ss(sptr + 7); - - __m256 _val01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val0), _val1, 1); - __m256 _val23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val2), _val3, 1); - __m256 _val45 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val4), _val5, 1); - __m256 _val67 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val6), _val7, 1); - - __m256i _w0123 = _mm256_lddqu_si256((const __m256i*)kptr); - __m256i _w4567 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - - __m256 _w01 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 0)); - __m256 _w23 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 1)); - __m256 _w45 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w4567, 0)); - __m256 _w67 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w4567, 1)); - - _sum01 = _mm256_comp_fmadd_ps(_val01, _w01, _sum01); - _sum23 = _mm256_comp_fmadd_ps(_val23, _w23, _sum23); - _sum45 = _mm256_comp_fmadd_ps(_val45, _w45, _sum45); - _sum67 = _mm256_comp_fmadd_ps(_val67, _w67, _sum67); - - sptr += 8; - kptr += 32; - } - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_set1_ps(sptr[0]); - __m128 _val1 = _mm_set1_ps(sptr[1]); - __m128 _val2 = _mm_set1_ps(sptr[2]); - __m128 _val3 = _mm_set1_ps(sptr[3]); - - __m256 _val01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val0), _val1, 1); - __m256 _val23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val2), _val3, 1); - - __m256i _w0123 = _mm256_lddqu_si256((const __m256i*)kptr); - __m256 _w01 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 0)); - __m256 _w23 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 1)); - - _sum01 = _mm256_comp_fmadd_ps(_val01, _w01, _sum01); - _sum23 = _mm256_comp_fmadd_ps(_val23, _w23, _sum23); - - sptr += 4; - kptr += 16; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_set1_ps(sptr[0]); - __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); - _sum0 = _mm_comp_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 4; - } - - _sum01 = _mm256_add_ps(_sum01, _sum23); - _sum45 = _mm256_add_ps(_sum45, _sum67); - _sum01 = _mm256_add_ps(_sum01, _sum45); - - _sum0 = _mm_add_ps(_sum0, _mm256_extractf128_ps(_sum01, 0)); - _sum0 = _mm_add_ps(_sum0, _mm256_extractf128_ps(_sum01, 1)); - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm_storeu_ps(outptr + p * 4, _sum0); - } -#else // __F16C__ - (void)bottom_blob; - (void)top_blob; - (void)weight_data_fp16; - (void)bias_data; - (void)activation_type; - (void)activation_params; - (void)opt; -#endif // __F16C__ -} - -static void innerproduct_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ - if (ncnn::cpu_support_x86_f16c()) - { - innerproduct_fp16s_sse_f16c(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); - return; - } -#endif - -#if __F16C__ - const int num_input = bottom_blob.w * bottom_blob.elempack; - const int num_output = top_blob.w; - - const 
float* bias_data_ptr = bias_data; - - int remain_num_output_start = 0; - int nn_num_output = num_output >> 3; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) - { - int p = pp * 8; - - float sums[8] = {0.0f}; - if (bias_data_ptr) - { - sums[0] = bias_data_ptr[p]; - sums[1] = bias_data_ptr[p + 1]; - sums[2] = bias_data_ptr[p + 2]; - sums[3] = bias_data_ptr[p + 3]; - sums[4] = bias_data_ptr[p + 4]; - sums[5] = bias_data_ptr[p + 5]; - sums[6] = bias_data_ptr[p + 6]; - sums[7] = bias_data_ptr[p + 7]; - } - - const unsigned short* w0 = weight_data_fp16.row(p); - const unsigned short* w1 = weight_data_fp16.row(p + 1); - const unsigned short* w2 = weight_data_fp16.row(p + 2); - const unsigned short* w3 = weight_data_fp16.row(p + 3); - const unsigned short* w4 = weight_data_fp16.row(p + 4); - const unsigned short* w5 = weight_data_fp16.row(p + 5); - const unsigned short* w6 = weight_data_fp16.row(p + 6); - const unsigned short* w7 = weight_data_fp16.row(p + 7); - - const float* m = bottom_blob; - - __m256 _sum0 = _mm256_setzero_ps(); - __m256 _sum1 = _mm256_setzero_ps(); - __m256 _sum2 = _mm256_setzero_ps(); - __m256 _sum3 = _mm256_setzero_ps(); - __m256 _sum4 = _mm256_setzero_ps(); - __m256 _sum5 = _mm256_setzero_ps(); - __m256 _sum6 = _mm256_setzero_ps(); - __m256 _sum7 = _mm256_setzero_ps(); - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w0)); - __m256 _w1 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w1)); - __m256 _w2 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w2)); - __m256 _w3 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w3)); - __m256 _w4 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w4)); - __m256 _w5 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w5)); - __m256 _w6 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w6)); - __m256 _w7 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w7)); - - _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); - _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); - _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); - _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); - _sum4 = _mm256_comp_fmadd_ps(_m, _w4, _sum4); - _sum5 = _mm256_comp_fmadd_ps(_m, _w5, _sum5); - _sum6 = _mm256_comp_fmadd_ps(_m, _w6, _sum6); - _sum7 = _mm256_comp_fmadd_ps(_m, _w7, _sum7); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - w4 += 8; - w5 += 8; - w6 += 8; - w7 += 8; - } - for (; i < num_input; i++) - { - sums[0] += *m * float16_to_float32(*w0); - sums[1] += *m * float16_to_float32(*w1); - sums[2] += *m * float16_to_float32(*w2); - sums[3] += *m * float16_to_float32(*w3); - sums[4] += *m * float16_to_float32(*w4); - sums[5] += *m * float16_to_float32(*w5); - sums[6] += *m * float16_to_float32(*w6); - sums[7] += *m * float16_to_float32(*w7); - - m++; - w0++; - w1++; - w2++; - w3++; - w4++; - w5++; - w6++; - w7++; - } - - __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); - __m256 _sums_f = _mm256_loadu_ps(sums); - _sums = _mm256_add_ps(_sums_f, _sums); - _sums = activation_avx(_sums, activation_type, activation_params); - - float* outptr = top_blob; - _mm256_storeu_ps(outptr + p, _sums); - } - - remain_num_output_start += (nn_num_output << 3); - nn_num_output = (num_output - remain_num_output_start) >> 2; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) - { - int p = remain_num_output_start + (pp * 4); - - float sums[4] = 
{0.0f}; - if (bias_data_ptr) - { - sums[0] = bias_data_ptr[p]; - sums[1] = bias_data_ptr[p + 1]; - sums[2] = bias_data_ptr[p + 2]; - sums[3] = bias_data_ptr[p + 3]; - } - - const unsigned short* w0 = weight_data_fp16.row(p); - const unsigned short* w1 = weight_data_fp16.row(p + 1); - const unsigned short* w2 = weight_data_fp16.row(p + 2); - const unsigned short* w3 = weight_data_fp16.row(p + 3); - - const float* m = bottom_blob; - - int i = 0; - - __m256 _sum0 = _mm256_setzero_ps(); - __m256 _sum1 = _mm256_setzero_ps(); - __m256 _sum2 = _mm256_setzero_ps(); - __m256 _sum3 = _mm256_setzero_ps(); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w0)); - __m256 _w1 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w1)); - __m256 _w2 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w2)); - __m256 _w3 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w3)); - - _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); - _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); - _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); - _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - } - - __m128 _sum0l = _mm_setzero_ps(); - __m128 _sum1l = _mm_setzero_ps(); - __m128 _sum2l = _mm_setzero_ps(); - __m128 _sum3l = _mm_setzero_ps(); - for (; i + 3 < num_input; i += 4) - { - __m128 _m = _mm_loadu_ps(m); - - __m128 _w0 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w0)); - __m128 _w1 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w1)); - __m128 _w2 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w2)); - __m128 _w3 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w3)); - - _sum0l = _mm_comp_fmadd_ps(_m, _w0, _sum0l); - _sum1l = _mm_comp_fmadd_ps(_m, _w1, _sum1l); - _sum2l = _mm_comp_fmadd_ps(_m, _w2, _sum2l); - _sum3l = _mm_comp_fmadd_ps(_m, _w3, _sum3l); - - m += 4; - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - } - for (; i < num_input; i++) - { - sums[0] += *m * float16_to_float32(*w0); - sums[1] += *m * float16_to_float32(*w1); - sums[2] += *m * float16_to_float32(*w2); - sums[3] += *m * float16_to_float32(*w3); - - m++; - w0++; - w1++; - w2++; - w3++; - } - - __m128 _sums = _mm_loadu_ps(sums); - - _sums = _mm_add_ps(HorizontalSums(_sum0, _sum1, _sum2, _sum3), _sums); - - _MM_TRANSPOSE4_PS(_sum0l, _sum1l, _sum2l, _sum3l); - _sums = _mm_add_ps(_sum0l, _sums); - _sums = _mm_add_ps(_sum1l, _sums); - _sums = _mm_add_ps(_sum2l, _sums); - _sums = _mm_add_ps(_sum3l, _sums); - _sums = activation_sse(_sums, activation_type, activation_params); - - float* outptr = top_blob; - _mm_storeu_ps(outptr + p, _sums); - } - - remain_num_output_start += (nn_num_output << 2); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = remain_num_output_start; p < num_output; p++) - { - float sum = 0.f; - - if (bias_data_ptr) - sum = bias_data_ptr[p]; - - const unsigned short* w = weight_data_fp16.row(p); - - const float* m = bottom_blob; - - int i = 0; - - __m256 _sum = _mm256_setzero_ps(); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w)); - _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); - - m += 8; - w += 8; - } - - __m128 _suml = _mm_setzero_ps(); - for (; i + 3 < num_input; i += 4) - { - __m128 _m = _mm_loadu_ps(m); - __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w)); - _suml = _mm_comp_fmadd_ps(_m, _w, _suml); - - m += 4; - w += 4; - } - for (; i < num_input; i++) - { - sum += *m * 
float16_to_float32(*w); - m++; - w++; - } - - sum += _mm256_reduce_add_ps(_sum); - - sum += _mm_reduce_add_ps(_suml); - - sum = activation_ss(sum, activation_type, activation_params); - - float* outptr = top_blob; - outptr[p] = sum; - } -#else // __F16C__ - (void)bottom_blob; - (void)top_blob; - (void)weight_data_fp16; - (void)bias_data; - (void)activation_type; - (void)activation_params; - (void)opt; -#endif // __F16C__ -} - -static void innerproduct_transform_kernel_fp16s_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt) -{ -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ - if (ncnn::cpu_support_x86_f16c()) - { - innerproduct_transform_kernel_fp16s_sse_f16c(weight_data, weight_data_tm, num_input, num_output, opt); - return; - } -#endif - -#if __F16C__ - int out_elempack = 1; - if (opt.use_packing_layout) - { -#if __AVX512F__ - out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#else - out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#endif - } - - // src = inch-outch - // dst = pb-inch-outch/pb -#if __AVX512F__ - if (out_elempack == 16) - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / 16, (size_t)32u, 16); - - for (int q = 0; q + 15 < num_output; q += 16) - { - unsigned short* g0 = weight_data_tm.row(q / 16); - - const float* k0 = weight_data_r2.row(q); - const float* k1 = weight_data_r2.row(q + 1); - const float* k2 = weight_data_r2.row(q + 2); - const float* k3 = weight_data_r2.row(q + 3); - const float* k4 = weight_data_r2.row(q + 4); - const float* k5 = weight_data_r2.row(q + 5); - const float* k6 = weight_data_r2.row(q + 6); - const float* k7 = weight_data_r2.row(q + 7); - const float* k8 = weight_data_r2.row(q + 8); - const float* k9 = weight_data_r2.row(q + 9); - const float* ka = weight_data_r2.row(q + 10); - const float* kb = weight_data_r2.row(q + 11); - const float* kc = weight_data_r2.row(q + 12); - const float* kd = weight_data_r2.row(q + 13); - const float* ke = weight_data_r2.row(q + 14); - const float* kf = weight_data_r2.row(q + 15); - - int p = 0; - for (; p + 15 < num_input; p += 16) - { - // transpose 16x16 - __m256i _r0 = _mm512_cvtps_ph(_mm512_loadu_ps(k0), _MM_FROUND_TRUNC); - __m256i _r1 = _mm512_cvtps_ph(_mm512_loadu_ps(k1), _MM_FROUND_TRUNC); - __m256i _r2 = _mm512_cvtps_ph(_mm512_loadu_ps(k2), _MM_FROUND_TRUNC); - __m256i _r3 = _mm512_cvtps_ph(_mm512_loadu_ps(k3), _MM_FROUND_TRUNC); - __m256i _r4 = _mm512_cvtps_ph(_mm512_loadu_ps(k4), _MM_FROUND_TRUNC); - __m256i _r5 = _mm512_cvtps_ph(_mm512_loadu_ps(k5), _MM_FROUND_TRUNC); - __m256i _r6 = _mm512_cvtps_ph(_mm512_loadu_ps(k6), _MM_FROUND_TRUNC); - __m256i _r7 = _mm512_cvtps_ph(_mm512_loadu_ps(k7), _MM_FROUND_TRUNC); - __m256i _r8 = _mm512_cvtps_ph(_mm512_loadu_ps(k8), _MM_FROUND_TRUNC); - __m256i _r9 = _mm512_cvtps_ph(_mm512_loadu_ps(k9), _MM_FROUND_TRUNC); - __m256i _ra = _mm512_cvtps_ph(_mm512_loadu_ps(ka), _MM_FROUND_TRUNC); - __m256i _rb = _mm512_cvtps_ph(_mm512_loadu_ps(kb), _MM_FROUND_TRUNC); - __m256i _rc = _mm512_cvtps_ph(_mm512_loadu_ps(kc), _MM_FROUND_TRUNC); - __m256i _rd = _mm512_cvtps_ph(_mm512_loadu_ps(kd), _MM_FROUND_TRUNC); - __m256i _re = _mm512_cvtps_ph(_mm512_loadu_ps(ke), _MM_FROUND_TRUNC); - __m256i _rf = _mm512_cvtps_ph(_mm512_loadu_ps(kf), _MM_FROUND_TRUNC); - - __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); - __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); - __m256i _tmp2 = 
_mm256_unpacklo_epi16(_r2, _r3); - __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); - __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); - __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); - __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); - __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7); - __m256i _tmp8 = _mm256_unpacklo_epi16(_r8, _r9); - __m256i _tmp9 = _mm256_unpackhi_epi16(_r8, _r9); - __m256i _tmpa = _mm256_unpacklo_epi16(_ra, _rb); - __m256i _tmpb = _mm256_unpackhi_epi16(_ra, _rb); - __m256i _tmpc = _mm256_unpacklo_epi16(_rc, _rd); - __m256i _tmpd = _mm256_unpackhi_epi16(_rc, _rd); - __m256i _tmpe = _mm256_unpacklo_epi16(_re, _rf); - __m256i _tmpf = _mm256_unpackhi_epi16(_re, _rf); - - __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); - __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); - __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); - __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); - __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); - __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); - __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); - __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); - __m256i _tmpo = _mm256_unpacklo_epi32(_tmp8, _tmpa); - __m256i _tmpp = _mm256_unpackhi_epi32(_tmp8, _tmpa); - __m256i _tmpq = _mm256_unpacklo_epi32(_tmp9, _tmpb); - __m256i _tmpr = _mm256_unpackhi_epi32(_tmp9, _tmpb); - __m256i _tmps = _mm256_unpacklo_epi32(_tmpc, _tmpe); - __m256i _tmpt = _mm256_unpackhi_epi32(_tmpc, _tmpe); - __m256i _tmpu = _mm256_unpacklo_epi32(_tmpd, _tmpf); - __m256i _tmpv = _mm256_unpackhi_epi32(_tmpd, _tmpf); - - _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); - _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); - _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); - _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); - _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); - _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); - _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn); - _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); - _tmp8 = _mm256_unpacklo_epi64(_tmpo, _tmps); - _tmp9 = _mm256_unpackhi_epi64(_tmpo, _tmps); - _tmpa = _mm256_unpacklo_epi64(_tmpp, _tmpt); - _tmpb = _mm256_unpackhi_epi64(_tmpp, _tmpt); - _tmpc = _mm256_unpacklo_epi64(_tmpq, _tmpu); - _tmpd = _mm256_unpackhi_epi64(_tmpq, _tmpu); - _tmpe = _mm256_unpacklo_epi64(_tmpr, _tmpv); - _tmpf = _mm256_unpackhi_epi64(_tmpr, _tmpv); - - _r0 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r5 = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r6 = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r7 = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r8 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 3, 0, 1)); - _r9 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 3, 0, 1)); - _ra = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 3, 0, 1)); - _rb = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 3, 0, 1)); - _rc = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - _rd = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _re = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _rf = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); - - 
_mm256_storeu_si256((__m256i*)g0, _r0); - _mm256_storeu_si256((__m256i*)(g0 + 16), _r1); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 8), _r8); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 9), _r9); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 10), _ra); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 11), _rb); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 12), _rc); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 13), _rd); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 14), _re); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 15), _rf); - - k0 += 16; - k1 += 16; - k2 += 16; - k3 += 16; - k4 += 16; - k5 += 16; - k6 += 16; - k7 += 16; - k8 += 16; - k9 += 16; - ka += 16; - kb += 16; - kc += 16; - kd += 16; - ke += 16; - kf += 16; - g0 += 256; - } - for (; p + 7 < num_input; p += 8) - { - // transpose 16x8 - __m128i _r0 = _mm256_cvtps_ph(_mm256_loadu_ps(k0), _MM_FROUND_TRUNC); - __m128i _r1 = _mm256_cvtps_ph(_mm256_loadu_ps(k1), _MM_FROUND_TRUNC); - __m128i _r2 = _mm256_cvtps_ph(_mm256_loadu_ps(k2), _MM_FROUND_TRUNC); - __m128i _r3 = _mm256_cvtps_ph(_mm256_loadu_ps(k3), _MM_FROUND_TRUNC); - __m128i _r4 = _mm256_cvtps_ph(_mm256_loadu_ps(k4), _MM_FROUND_TRUNC); - __m128i _r5 = _mm256_cvtps_ph(_mm256_loadu_ps(k5), _MM_FROUND_TRUNC); - __m128i _r6 = _mm256_cvtps_ph(_mm256_loadu_ps(k6), _MM_FROUND_TRUNC); - __m128i _r7 = _mm256_cvtps_ph(_mm256_loadu_ps(k7), _MM_FROUND_TRUNC); - __m128i _r8 = _mm256_cvtps_ph(_mm256_loadu_ps(k8), _MM_FROUND_TRUNC); - __m128i _r9 = _mm256_cvtps_ph(_mm256_loadu_ps(k9), _MM_FROUND_TRUNC); - __m128i _ra = _mm256_cvtps_ph(_mm256_loadu_ps(ka), _MM_FROUND_TRUNC); - __m128i _rb = _mm256_cvtps_ph(_mm256_loadu_ps(kb), _MM_FROUND_TRUNC); - __m128i _rc = _mm256_cvtps_ph(_mm256_loadu_ps(kc), _MM_FROUND_TRUNC); - __m128i _rd = _mm256_cvtps_ph(_mm256_loadu_ps(kd), _MM_FROUND_TRUNC); - __m128i _re = _mm256_cvtps_ph(_mm256_loadu_ps(ke), _MM_FROUND_TRUNC); - __m128i _rf = _mm256_cvtps_ph(_mm256_loadu_ps(kf), _MM_FROUND_TRUNC); - - __m256i _r08 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _r8, 1); - __m256i _r19 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r1), _r9, 1); - __m256i _r2a = _mm256_inserti128_si256(_mm256_castsi128_si256(_r2), _ra, 1); - __m256i _r3b = _mm256_inserti128_si256(_mm256_castsi128_si256(_r3), _rb, 1); - __m256i _r4c = _mm256_inserti128_si256(_mm256_castsi128_si256(_r4), _rc, 1); - __m256i _r5d = _mm256_inserti128_si256(_mm256_castsi128_si256(_r5), _rd, 1); - __m256i _r6e = _mm256_inserti128_si256(_mm256_castsi128_si256(_r6), _re, 1); - __m256i _r7f = _mm256_inserti128_si256(_mm256_castsi128_si256(_r7), _rf, 1); - - __m256i _tmp0 = _mm256_unpacklo_epi16(_r08, _r19); - __m256i _tmp1 = _mm256_unpackhi_epi16(_r08, _r19); - __m256i _tmp2 = _mm256_unpacklo_epi16(_r2a, _r3b); - __m256i _tmp3 = _mm256_unpackhi_epi16(_r2a, _r3b); - __m256i _tmp4 = _mm256_unpacklo_epi16(_r4c, _r5d); - __m256i _tmp5 = _mm256_unpackhi_epi16(_r4c, _r5d); - __m256i _tmp6 = _mm256_unpacklo_epi16(_r6e, _r7f); - __m256i _tmp7 = _mm256_unpackhi_epi16(_r6e, _r7f); - - __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); - __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); - __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); - __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); - __m256i 
_tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); - __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); - __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); - __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); - - _r08 = _mm256_unpacklo_epi64(_tmpg, _tmpk); - _r19 = _mm256_unpackhi_epi64(_tmpg, _tmpk); - _r2a = _mm256_unpacklo_epi64(_tmph, _tmpl); - _r3b = _mm256_unpackhi_epi64(_tmph, _tmpl); - _r4c = _mm256_unpacklo_epi64(_tmpi, _tmpm); - _r5d = _mm256_unpackhi_epi64(_tmpi, _tmpm); - _r6e = _mm256_unpacklo_epi64(_tmpj, _tmpn); - _r7f = _mm256_unpackhi_epi64(_tmpj, _tmpn); - - _mm256_storeu_si256((__m256i*)g0, _r08); - _mm256_storeu_si256((__m256i*)(g0 + 16), _r19); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2a); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3b); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4c); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5d); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6e); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7f); - - k0 += 8; - k1 += 8; - k2 += 8; - k3 += 8; - k4 += 8; - k5 += 8; - k6 += 8; - k7 += 8; - k8 += 8; - k9 += 8; - ka += 8; - kb += 8; - kc += 8; - kd += 8; - ke += 8; - kf += 8; - g0 += 128; - } - for (; p < num_input; p++) - { - g0[0] = float32_to_float16(*k0++); - g0[1] = float32_to_float16(*k1++); - g0[2] = float32_to_float16(*k2++); - g0[3] = float32_to_float16(*k3++); - g0[4] = float32_to_float16(*k4++); - g0[5] = float32_to_float16(*k5++); - g0[6] = float32_to_float16(*k6++); - g0[7] = float32_to_float16(*k7++); - g0[8] = float32_to_float16(*k8++); - g0[9] = float32_to_float16(*k9++); - g0[10] = float32_to_float16(*ka++); - g0[11] = float32_to_float16(*kb++); - g0[12] = float32_to_float16(*kc++); - g0[13] = float32_to_float16(*kd++); - g0[14] = float32_to_float16(*ke++); - g0[15] = float32_to_float16(*kf++); - g0 += 16; - } - } - } -#endif // __AVX512F__ - - if (out_elempack == 8) - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / 8, (size_t)16u, 8); - - for (int q = 0; q + 7 < num_output; q += 8) - { - unsigned short* g0 = weight_data_tm.row(q / 8); - - const float* k0 = weight_data_r2.row(q); - const float* k1 = weight_data_r2.row(q + 1); - const float* k2 = weight_data_r2.row(q + 2); - const float* k3 = weight_data_r2.row(q + 3); - const float* k4 = weight_data_r2.row(q + 4); - const float* k5 = weight_data_r2.row(q + 5); - const float* k6 = weight_data_r2.row(q + 6); - const float* k7 = weight_data_r2.row(q + 7); - - int p = 0; -#if __AVX512F__ - for (; p + 15 < num_input; p += 16) - { - // transpose 8x16 - __m256i _r0 = _mm512_cvtps_ph(_mm512_loadu_ps(k0), _MM_FROUND_TRUNC); - __m256i _r1 = _mm512_cvtps_ph(_mm512_loadu_ps(k1), _MM_FROUND_TRUNC); - __m256i _r2 = _mm512_cvtps_ph(_mm512_loadu_ps(k2), _MM_FROUND_TRUNC); - __m256i _r3 = _mm512_cvtps_ph(_mm512_loadu_ps(k3), _MM_FROUND_TRUNC); - __m256i _r4 = _mm512_cvtps_ph(_mm512_loadu_ps(k4), _MM_FROUND_TRUNC); - __m256i _r5 = _mm512_cvtps_ph(_mm512_loadu_ps(k5), _MM_FROUND_TRUNC); - __m256i _r6 = _mm512_cvtps_ph(_mm512_loadu_ps(k6), _MM_FROUND_TRUNC); - __m256i _r7 = _mm512_cvtps_ph(_mm512_loadu_ps(k7), _MM_FROUND_TRUNC); - - __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); - __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); - __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3); - __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); - __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); - __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); - __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); - __m256i 
_tmp7 = _mm256_unpackhi_epi16(_r6, _r7); - - __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); - __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); - __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); - __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); - __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); - __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); - __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); - __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); - - _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); - _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); - _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); - _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); - _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); - _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); - _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn); - _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); - - _r0 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); - _r5 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); - - _mm256_storeu_si256((__m256i*)g0, _r0); - _mm256_storeu_si256((__m256i*)(g0 + 16), _r1); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7); - - k0 += 16; - k1 += 16; - k2 += 16; - k3 += 16; - k4 += 16; - k5 += 16; - k6 += 16; - k7 += 16; - g0 += 128; - } -#endif // __AVX512F__ - for (; p + 7 < num_input; p += 8) - { - // transpose 8x8 - __m128i _r0 = _mm256_cvtps_ph(_mm256_loadu_ps(k0), _MM_FROUND_TRUNC); - __m128i _r1 = _mm256_cvtps_ph(_mm256_loadu_ps(k1), _MM_FROUND_TRUNC); - __m128i _r2 = _mm256_cvtps_ph(_mm256_loadu_ps(k2), _MM_FROUND_TRUNC); - __m128i _r3 = _mm256_cvtps_ph(_mm256_loadu_ps(k3), _MM_FROUND_TRUNC); - __m128i _r4 = _mm256_cvtps_ph(_mm256_loadu_ps(k4), _MM_FROUND_TRUNC); - __m128i _r5 = _mm256_cvtps_ph(_mm256_loadu_ps(k5), _MM_FROUND_TRUNC); - __m128i _r6 = _mm256_cvtps_ph(_mm256_loadu_ps(k6), _MM_FROUND_TRUNC); - __m128i _r7 = _mm256_cvtps_ph(_mm256_loadu_ps(k7), _MM_FROUND_TRUNC); - - __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1); - __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3); - __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3); - __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5); - __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5); - __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7); - __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7); - - __m128i _tmp8 = _mm_unpacklo_epi32(_tmp0, _tmp2); - __m128i _tmp9 = _mm_unpackhi_epi32(_tmp0, _tmp2); - __m128i _tmpa = _mm_unpacklo_epi32(_tmp1, _tmp3); - __m128i _tmpb = _mm_unpackhi_epi32(_tmp1, _tmp3); - __m128i _tmpc = _mm_unpacklo_epi32(_tmp4, _tmp6); - __m128i _tmpd = _mm_unpackhi_epi32(_tmp4, _tmp6); - __m128i _tmpe = _mm_unpacklo_epi32(_tmp5, _tmp7); - __m128i _tmpf = _mm_unpackhi_epi32(_tmp5, _tmp7); - - _r0 = _mm_unpacklo_epi64(_tmp8, _tmpc); - _r1 = _mm_unpackhi_epi64(_tmp8, _tmpc); - _r2 = 
_mm_unpacklo_epi64(_tmp9, _tmpd); - _r3 = _mm_unpackhi_epi64(_tmp9, _tmpd); - _r4 = _mm_unpacklo_epi64(_tmpa, _tmpe); - _r5 = _mm_unpackhi_epi64(_tmpa, _tmpe); - _r6 = _mm_unpacklo_epi64(_tmpb, _tmpf); - _r7 = _mm_unpackhi_epi64(_tmpb, _tmpf); - - _mm_storeu_si128((__m128i*)g0, _r0); - _mm_storeu_si128((__m128i*)(g0 + 8), _r1); - _mm_storeu_si128((__m128i*)(g0 + 16), _r2); - _mm_storeu_si128((__m128i*)(g0 + 24), _r3); - _mm_storeu_si128((__m128i*)(g0 + 32), _r4); - _mm_storeu_si128((__m128i*)(g0 + 40), _r5); - _mm_storeu_si128((__m128i*)(g0 + 48), _r6); - _mm_storeu_si128((__m128i*)(g0 + 56), _r7); - - k0 += 8; - k1 += 8; - k2 += 8; - k3 += 8; - k4 += 8; - k5 += 8; - k6 += 8; - k7 += 8; - g0 += 64; - } - for (; p < num_input; p++) - { - g0[0] = float32_to_float16(*k0++); - g0[1] = float32_to_float16(*k1++); - g0[2] = float32_to_float16(*k2++); - g0[3] = float32_to_float16(*k3++); - g0[4] = float32_to_float16(*k4++); - g0[5] = float32_to_float16(*k5++); - g0[6] = float32_to_float16(*k6++); - g0[7] = float32_to_float16(*k7++); - g0 += 8; - } - } - } - - if (out_elempack == 4) - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4); - - for (int q = 0; q + 3 < num_output; q += 4) - { - unsigned short* g0 = weight_data_tm.row(q / 4); - - const float* k0 = weight_data_r2.row(q); - const float* k1 = weight_data_r2.row(q + 1); - const float* k2 = weight_data_r2.row(q + 2); - const float* k3 = weight_data_r2.row(q + 3); - - int p = 0; - for (; p + 3 < num_input; p += 4) - { - // transpose 4x4 - __m128 _r0 = _mm_loadu_ps(k0); - __m128 _r1 = _mm_loadu_ps(k1); - __m128 _r2 = _mm_loadu_ps(k2); - __m128 _r3 = _mm_loadu_ps(k3); - _MM_TRANSPOSE4_PS(_r0, _r1, _r2, _r3); - __m256 _r01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_r0), _r1, 1); - __m256 _r23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_r2), _r3, 1); - __m128i _r01_fp16 = _mm256_cvtps_ph(_r01, _MM_FROUND_TRUNC); - __m128i _r23_fp16 = _mm256_cvtps_ph(_r23, _MM_FROUND_TRUNC); - _mm_storeu_si128((__m128i*)g0, _r01_fp16); - _mm_storeu_si128((__m128i*)(g0 + 8), _r23_fp16); - - k0 += 4; - k1 += 4; - k2 += 4; - k3 += 4; - g0 += 16; - } - for (; p < num_input; p++) - { - g0[0] = float32_to_float16(*k0++); - g0[1] = float32_to_float16(*k1++); - g0[2] = float32_to_float16(*k2++); - g0[3] = float32_to_float16(*k3++); - g0 += 4; - } - } - } - - if (out_elempack == 1) - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); - } -#else // __F16C__ - (void)weight_data; - (void)weight_data_tm; - (void)num_input; - (void)num_output; - (void)opt; -#endif // __F16C__ -} diff --git a/src/layer/x86/innerproduct_gemm_fp16s.h b/src/layer/x86/innerproduct_gemm_fp.h similarity index 79% rename from src/layer/x86/innerproduct_gemm_fp16s.h rename to src/layer/x86/innerproduct_gemm_fp.h index 02b8d66c8c11..68a1a37d75f0 100644 --- a/src/layer/x86/innerproduct_gemm_fp16s.h +++ b/src/layer/x86/innerproduct_gemm_fp.h @@ -13,20 +13,23 @@ // specific language governing permissions and limitations under the License. 
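// --- editorial sketch, not part of the patch ---
// The innerproduct_transform_kernel_fp16s_sse code above packs the fp32 weight
// matrix (num_output rows of num_input floats) into fp16 interleaved by
// out_elempack, so the kernels in this header can load one packed row per group
// of outputs. A scalar reference of that layout for the pb > 1 case (the
// pb == 1 case above simply casts with cast_float32_to_float16), assuming a
// float32_to_float16() helper equivalent to the one used in the patch:
static void pack_weights_fp16_ref(const float* src, unsigned short* dst,
                                  int num_input, int num_output, int pb)
{
    // dst row q/pb stores, for every input p, the pb weights of outputs q..q+pb-1
    for (int q = 0; q + pb - 1 < num_output; q += pb)
    {
        unsigned short* g0 = dst + (q / pb) * num_input * pb;
        for (int p = 0; p < num_input; p++)
        {
            for (int j = 0; j < pb; j++)
            {
                *g0++ = float32_to_float16(src[(q + j) * num_input + p]);
            }
        }
    }
}
// The AVX-512 / AVX fast paths above produce exactly this layout through the
// 16x16, 16x8 and 8x8 transposes instead of the per-element loop.
// --- end editorial sketch ---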
#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ -void innerproduct_gemm_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); +void innerproduct_gemm_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); #endif -static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#if NCNN_IMPL_FP16S +static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#else +static void innerproduct_gemm_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#endif { -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ +#if NCNN_RUNTIME_CPU && NCNN_IMPL_FP16S && NCNN_F16C && __AVX__ && !__F16C__ if (ncnn::cpu_support_x86_f16c()) { - innerproduct_gemm_fp16s_sse_f16c(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); + innerproduct_gemm_fp16s_sse_f16c(bottom_blob, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); return; } -#endif +#else // NCNN_RUNTIME_CPU -#if __F16C__ const int num_input = bottom_blob.w; const int elempack = bottom_blob.elempack; const int num_output = top_blob.w; @@ -35,18 +38,24 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c const float* bias_data_ptr = bias_data; int num_output_elempack = 1; +#if __SSE2__ if (opt.use_packing_layout) { #if __AVX512F__ num_output_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#else +#elif __AVX__ num_output_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; #endif } +#endif // __SSE2__ #pragma omp parallel for num_threads(opt.num_threads) for (int j = 0; j < h; j++) { +#if __SSE2__ +#if __AVX__ #if __AVX512F__ if (elempack == 16 && num_output_elempack == 16) { @@ -54,7 +63,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -99,7 +112,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _vale = _mm512_set1_ps(m[14]); __m512 _valf = _mm512_set1_ps(m[15]); +#if NCNN_IMPL_FP16S __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif _sum0 = _mm512_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm512_fmadd_ps(_val1, _w, _sum1); @@ -139,7 +156,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sume = activation_avx512(_sume, activation_type, activation_params); _sumf = activation_avx512(_sumf, activation_type, activation_params); - transpose16_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb, _sumc, _sumd, _sume, _sumf); + transpose16x16_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb, _sumc, _sumd, _sume, _sumf); _mm512_storeu_ps(outptr, _sum0); _mm512_storeu_ps(outptr + 16, _sum1); @@ -167,7 +184,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum = _mm512_setzero_ps(); @@ -181,7 +202,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m512 _val = _mm512_set1_ps(m[0]); +#if NCNN_IMPL_FP16S __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif + _sum = _mm512_fmadd_ps(_val, _w, _sum); m += 1; @@ -201,7 +227,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -222,8 +252,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _val1 = _mm512_set1_ps(m[1]); __m512 _val2 = _mm512_set1_ps(m[2]); __m512 _val3 = _mm512_set1_ps(m[3]); - +#if NCNN_IMPL_FP16S __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif _sum0 = _mm512_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm512_fmadd_ps(_val1, _w, _sum1); @@ -239,26 +272,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sum2 = activation_avx512(_sum2, activation_type, activation_params); _sum3 = activation_avx512(_sum3, activation_type, activation_params); - // transpose 16x4 - __m512 
_tmp0 = _mm512_unpacklo_ps(_sum0, _sum1); - __m512 _tmp1 = _mm512_unpackhi_ps(_sum0, _sum1); - __m512 _tmp2 = _mm512_unpacklo_ps(_sum2, _sum3); - __m512 _tmp3 = _mm512_unpackhi_ps(_sum2, _sum3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _sum0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _sum1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _sum2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _sum3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_sum0, _sum1, _sum2, _sum3); _mm512_storeu_ps(outptr, _sum0); _mm512_storeu_ps(outptr + 16, _sum1); @@ -274,7 +288,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -303,8 +321,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _val5 = _mm512_set1_ps(m[5]); __m512 _val6 = _mm512_set1_ps(m[6]); __m512 _val7 = _mm512_set1_ps(m[7]); - +#if NCNN_IMPL_FP16S __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif _sum0 = _mm512_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm512_fmadd_ps(_val1, _w, _sum1); @@ -328,42 +349,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sum6 = activation_avx512(_sum6, activation_type, activation_params); _sum7 = activation_avx512(_sum7, activation_type, activation_params); - // transpose 16x8 - __m512 _tmp0 = _mm512_unpacklo_ps(_sum0, _sum1); - __m512 _tmp1 = _mm512_unpackhi_ps(_sum0, _sum1); - __m512 _tmp2 = _mm512_unpacklo_ps(_sum2, _sum3); - __m512 _tmp3 = _mm512_unpackhi_ps(_sum2, _sum3); - __m512 _tmp4 = _mm512_unpacklo_ps(_sum4, _sum5); - __m512 _tmp5 = _mm512_unpackhi_ps(_sum4, _sum5); - __m512 _tmp6 = _mm512_unpacklo_ps(_sum6, _sum7); - __m512 _tmp7 = _mm512_unpackhi_ps(_sum6, _sum7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 
= _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _sum0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _sum1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _sum2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _sum3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _sum4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _sum5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _sum6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _sum7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); _mm512_storeu_ps(outptr, _sum0); _mm512_storeu_ps(outptr + 16, _sum1); @@ -383,10 +369,17 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = (const float*)weight_data_tm + num_input * p; +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); + __m512 _sum1 = _mm512_setzero_ps(); + __m512 _sum2 = _mm512_setzero_ps(); + __m512 _sum3 = _mm512_setzero_ps(); if (bias_data_ptr) { @@ -400,7 +393,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _val1 = _mm512_loadu_ps(m + 16); __m512 _val2 = _mm512_loadu_ps(m + 32); __m512 _val3 = _mm512_loadu_ps(m + 48); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m256 _ww = _mm256_insertf128_ps(_mm256_castps128_ps256(_w), _w, 1); __m512 _www = _mm512_insertf32x8(_mm512_castps256_ps512(_ww), _ww, 1); @@ -409,11 +402,17 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _w1 = _mm512_permute_ps(_www, _MM_SHUFFLE(1, 1, 1, 1)); __m512 _w2 = _mm512_permute_ps(_www, _MM_SHUFFLE(2, 2, 2, 2)); __m512 _w3 = _mm512_permute_ps(_www, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m512 _w0 = _mm512_set1_ps(kptr[0]); + __m512 _w1 = _mm512_set1_ps(kptr[1]); + __m512 _w2 = _mm512_set1_ps(kptr[2]); + __m512 _w3 = _mm512_set1_ps(kptr[3]); +#endif _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - _sum0 = _mm512_fmadd_ps(_val1, _w1, _sum0); - _sum0 = _mm512_fmadd_ps(_val2, _w2, _sum0); - _sum0 = _mm512_fmadd_ps(_val3, _w3, _sum0); + _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); m += 64; kptr += 4; @@ -421,13 +420,21 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m512 _val = _mm512_loadu_ps(m); +#if NCNN_IMPL_FP16S __m512 _w = _mm512_set1_ps(float16_to_float32(kptr[0])); +#else + __m512 _w = _mm512_set1_ps(kptr[0]); +#endif _sum0 = _mm512_fmadd_ps(_val, _w, _sum0); m += 16; kptr += 1; } + _sum0 = _mm512_add_ps(_sum0, _sum1); + _sum2 = _mm512_add_ps(_sum2, _sum3); + _sum0 = _mm512_add_ps(_sum0, _sum2); + _sum0 = activation_avx512(_sum0, activation_type, activation_params); _mm512_storeu_ps(outptr, _sum0); @@ -441,7 +448,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, 
Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -461,7 +472,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m512 _val = _mm512_loadu_ps(m); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m256 _ww = _mm256_insertf128_ps(_mm256_castps128_ps256(_w), _w, 1); __m512 _www = _mm512_insertf32x8(_mm512_castps256_ps512(_ww), _ww, 1); @@ -470,6 +481,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _w1 = _mm512_permute_ps(_www, _MM_SHUFFLE(1, 1, 1, 1)); __m512 _w2 = _mm512_permute_ps(_www, _MM_SHUFFLE(2, 2, 2, 2)); __m512 _w3 = _mm512_permute_ps(_www, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m512 _w0 = _mm512_set1_ps(kptr[0]); + __m512 _w1 = _mm512_set1_ps(kptr[1]); + __m512 _w2 = _mm512_set1_ps(kptr[2]); + __m512 _w3 = _mm512_set1_ps(kptr[3]); +#endif _sum0 = _mm512_fmadd_ps(_val, _w0, _sum0); _sum1 = _mm512_fmadd_ps(_val, _w1, _sum1); @@ -499,7 +516,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -527,7 +548,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m512 _val = _mm512_loadu_ps(m); - +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); __m512 _ww = _mm512_castps256_ps512(_w); __m512 _www0 = _mm512_shuffle_f32x4(_ww, _ww, _MM_SHUFFLE(0, 0, 0, 0)); @@ -541,6 +562,16 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _w5 = _mm512_permute_ps(_www1, _MM_SHUFFLE(1, 1, 1, 1)); __m512 _w6 = _mm512_permute_ps(_www1, _MM_SHUFFLE(2, 2, 2, 2)); __m512 _w7 = _mm512_permute_ps(_www1, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m512 _w0 = _mm512_set1_ps(kptr[0]); + __m512 _w1 = _mm512_set1_ps(kptr[1]); + __m512 _w2 = _mm512_set1_ps(kptr[2]); + __m512 _w3 = _mm512_set1_ps(kptr[3]); + __m512 _w4 = _mm512_set1_ps(kptr[4]); + __m512 _w5 = _mm512_set1_ps(kptr[5]); + __m512 _w6 = _mm512_set1_ps(kptr[6]); + __m512 _w7 = _mm512_set1_ps(kptr[7]); +#endif _sum0 = _mm512_fmadd_ps(_val, _w0, _sum0); _sum1 = _mm512_fmadd_ps(_val, _w1, _sum1); @@ -575,6 +606,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c outptr += 128; } } + #endif // __AVX512F__ if (elempack == 8 && num_output_elempack == 8) @@ -583,7 +615,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m256 _sum0 = _mm256_setzero_ps(); @@ -611,8 +647,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val5 = _mm256_broadcast_ss(m 
+ 5); __m256 _val6 = _mm256_broadcast_ss(m + 6); __m256 _val7 = _mm256_broadcast_ss(m + 7); - +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif _sum0 = _mm256_comp_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm256_comp_fmadd_ps(_val1, _w, _sum1); @@ -636,7 +675,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sum6 = activation_avx(_sum6, activation_type, activation_params); _sum7 = activation_avx(_sum7, activation_type, activation_params); - transpose8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); + transpose8x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); _mm256_storeu_ps(outptr, _sum0); _mm256_storeu_ps(outptr + 8, _sum1); @@ -656,14 +695,21 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); - __m256 _sum = _mm256_setzero_ps(); + __m256 _sum0 = _mm256_setzero_ps(); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); if (bias_data_ptr) { - _sum = _mm256_loadu_ps(bias_data_ptr + p * 8); + _sum0 = _mm256_loadu_ps(bias_data_ptr + p * 8); } int i = 0; @@ -673,33 +719,47 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val1 = _mm256_broadcast_ss(m + 1); __m256 _val2 = _mm256_broadcast_ss(m + 2); __m256 _val3 = _mm256_broadcast_ss(m + 3); - __m256 _val4 = _mm256_broadcast_ss(m + 4); - __m256 _val5 = _mm256_broadcast_ss(m + 5); - __m256 _val6 = _mm256_broadcast_ss(m + 6); - __m256 _val7 = _mm256_broadcast_ss(m + 7); - +#if NCNN_IMPL_FP16S __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - __m256i _w45 = _mm256_lddqu_si256((const __m256i*)(kptr + 32)); - __m256i _w67 = _mm256_lddqu_si256((const __m256i*)(kptr + 48)); - __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 1)); +#else + __m256 _w0 = _mm256_loadu_ps(kptr); + __m256 _w1 = _mm256_loadu_ps(kptr + 8); + __m256 _w2 = _mm256_loadu_ps(kptr + 16); + __m256 _w3 = _mm256_loadu_ps(kptr + 24); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); + + __m256 _val4 = _mm256_broadcast_ss(m + 4); + __m256 _val5 = _mm256_broadcast_ss(m + 5); + __m256 _val6 = _mm256_broadcast_ss(m + 6); + __m256 _val7 = _mm256_broadcast_ss(m + 7); +#if NCNN_IMPL_FP16S + __m256i _w45 = _mm256_lddqu_si256((const __m256i*)(kptr + 32)); + __m256i _w67 = _mm256_lddqu_si256((const __m256i*)(kptr + 48)); __m256 _w4 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w45, 0)); __m256 _w5 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w45, 1)); __m256 _w6 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w67, 0)); __m256 _w7 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w67, 1)); +#else + __m256 _w4 = _mm256_loadu_ps(kptr + 32); + __m256 _w5 = _mm256_loadu_ps(kptr + 40); + __m256 _w6 = 
_mm256_loadu_ps(kptr + 48); + __m256 _w7 = _mm256_loadu_ps(kptr + 56); +#endif - _sum = _mm256_comp_fmadd_ps(_val0, _w0, _sum); - _sum = _mm256_comp_fmadd_ps(_val1, _w1, _sum); - _sum = _mm256_comp_fmadd_ps(_val2, _w2, _sum); - _sum = _mm256_comp_fmadd_ps(_val3, _w3, _sum); - _sum = _mm256_comp_fmadd_ps(_val4, _w4, _sum); - _sum = _mm256_comp_fmadd_ps(_val5, _w5, _sum); - _sum = _mm256_comp_fmadd_ps(_val6, _w6, _sum); - _sum = _mm256_comp_fmadd_ps(_val7, _w7, _sum); + _sum0 = _mm256_comp_fmadd_ps(_val4, _w4, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val5, _w5, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val6, _w6, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val7, _w7, _sum3); m += 8; kptr += 64; @@ -710,19 +770,24 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val1 = _mm256_broadcast_ss(m + 1); __m256 _val2 = _mm256_broadcast_ss(m + 2); __m256 _val3 = _mm256_broadcast_ss(m + 3); - +#if NCNN_IMPL_FP16S __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 1)); +#else + __m256 _w0 = _mm256_loadu_ps(kptr); + __m256 _w1 = _mm256_loadu_ps(kptr + 8); + __m256 _w2 = _mm256_loadu_ps(kptr + 16); + __m256 _w3 = _mm256_loadu_ps(kptr + 24); +#endif - _sum = _mm256_comp_fmadd_ps(_val0, _w0, _sum); - _sum = _mm256_comp_fmadd_ps(_val1, _w1, _sum); - _sum = _mm256_comp_fmadd_ps(_val2, _w2, _sum); - _sum = _mm256_comp_fmadd_ps(_val3, _w3, _sum); + _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); m += 4; kptr += 32; @@ -730,16 +795,24 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m256 _val = _mm256_set1_ps(m[0]); +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); - _sum = _mm256_comp_fmadd_ps(_val, _w, _sum); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif + _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); m += 1; kptr += 8; } - _sum = activation_avx(_sum, activation_type, activation_params); + _sum0 = _mm256_add_ps(_sum0, _sum1); + _sum2 = _mm256_add_ps(_sum2, _sum3); + _sum0 = _mm256_add_ps(_sum0, _sum2); + + _sum0 = activation_avx(_sum0, activation_type, activation_params); - _mm256_storeu_ps(outptr, _sum); + _mm256_storeu_ps(outptr, _sum0); outptr += 8; } } @@ -750,7 +823,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m256 _sum0 = _mm256_setzero_ps(); @@ -771,8 +848,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val1 = _mm256_broadcast_ss(m + 1); __m256 _val2 = _mm256_broadcast_ss(m + 2); __m256 _val3 = _mm256_broadcast_ss(m + 3); - +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif _sum0 = _mm256_comp_fmadd_ps(_val0, _w, _sum0); 
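// --- editorial note, not part of the patch ---
// The rewritten accumulation in the hunks above (both the AVX-512 and AVX
// paths) replaces a single _sum with _sum0.._sum3 and folds them only after the
// loop, so each fmadd starts an independent dependency chain and the FMA
// latency of consecutive iterations can overlap. A minimal sketch of the same
// idea with two chains, reusing the _mm256_comp_fmadd_ps helper already used in
// this file (illustrative only, hypothetical function name):
static inline __m256 dot8_two_chains_ref(const float* m, const float* w8, int n)
{
    __m256 s0 = _mm256_setzero_ps();
    __m256 s1 = _mm256_setzero_ps();
    int i = 0;
    for (; i + 1 < n; i += 2)
    {
        // w8 holds 8 packed output weights per input element, as in the code above
        s0 = _mm256_comp_fmadd_ps(_mm256_set1_ps(m[i]), _mm256_loadu_ps(w8 + i * 8), s0);
        s1 = _mm256_comp_fmadd_ps(_mm256_set1_ps(m[i + 1]), _mm256_loadu_ps(w8 + (i + 1) * 8), s1);
    }
    for (; i < n; i++)
    {
        s0 = _mm256_comp_fmadd_ps(_mm256_set1_ps(m[i]), _mm256_loadu_ps(w8 + i * 8), s0);
    }
    return _mm256_add_ps(s0, s1); // fold the independent partial sums once
}
// --- end editorial note ---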
_sum1 = _mm256_comp_fmadd_ps(_val1, _w, _sum1); @@ -788,19 +868,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sum2 = activation_avx(_sum2, activation_type, activation_params); _sum3 = activation_avx(_sum3, activation_type, activation_params); - // transpose 8x4 - __m256 _tmp0 = _mm256_unpacklo_ps(_sum0, _sum1); - __m256 _tmp1 = _mm256_unpackhi_ps(_sum0, _sum1); - __m256 _tmp2 = _mm256_unpacklo_ps(_sum2, _sum3); - __m256 _tmp3 = _mm256_unpackhi_ps(_sum2, _sum3); - __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - _sum0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _sum1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _sum2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _sum3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x4_ps(_sum0, _sum1, _sum2, _sum3); _mm256_storeu_ps(outptr, _sum0); _mm256_storeu_ps(outptr + 8, _sum1); @@ -816,7 +884,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = (const float*)weight_data_tm + num_input * p; +#endif const float* m = bottom_blob.row(j); __m256 _sum0 = _mm256_setzero_ps(); @@ -836,7 +908,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val1 = _mm256_loadu_ps(m + 8); __m256 _val2 = _mm256_loadu_ps(m + 16); __m256 _val3 = _mm256_loadu_ps(m + 24); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m256 _ww = _mm256_insertf128_ps(_mm256_castps128_ps256(_w), _w, 1); @@ -844,6 +916,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _w1 = _mm256_permute_ps(_ww, _MM_SHUFFLE(1, 1, 1, 1)); __m256 _w2 = _mm256_permute_ps(_ww, _MM_SHUFFLE(2, 2, 2, 2)); __m256 _w3 = _mm256_permute_ps(_ww, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m256 _w0 = _mm256_set1_ps(kptr[0]); + __m256 _w1 = _mm256_set1_ps(kptr[1]); + __m256 _w2 = _mm256_set1_ps(kptr[2]); + __m256 _w3 = _mm256_set1_ps(kptr[3]); +#endif _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); @@ -856,8 +934,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m256 _val = _mm256_loadu_ps(m); - __m256 _k = _mm256_set1_ps(float16_to_float32(kptr[0])); - _sum0 = _mm256_comp_fmadd_ps(_val, _k, _sum0); +#if NCNN_IMPL_FP16S + __m256 _w = _mm256_set1_ps(float16_to_float32(kptr[0])); +#else + __m256 _w = _mm256_set1_ps(kptr[0]); +#endif + _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); m += 8; kptr += 1; @@ -880,7 +962,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m256 _sum0 = _mm256_setzero_ps(); @@ -900,7 +986,7 @@ static void innerproduct_gemm_fp16s_sse(const 
Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m256 _val = _mm256_loadu_ps(m); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m256 _ww = _mm256_insertf128_ps(_mm256_castps128_ps256(_w), _w, 1); @@ -908,6 +994,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _w1 = _mm256_permute_ps(_ww, _MM_SHUFFLE(1, 1, 1, 1)); __m256 _w2 = _mm256_permute_ps(_ww, _MM_SHUFFLE(2, 2, 2, 2)); __m256 _w3 = _mm256_permute_ps(_ww, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m256 _w0 = _mm256_set1_ps(kptr[0]); + __m256 _w1 = _mm256_set1_ps(kptr[1]); + __m256 _w2 = _mm256_set1_ps(kptr[2]); + __m256 _w3 = _mm256_set1_ps(kptr[3]); +#endif _sum0 = _mm256_comp_fmadd_ps(_val, _w0, _sum0); _sum1 = _mm256_comp_fmadd_ps(_val, _w1, _sum1); @@ -930,6 +1022,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c outptr += 32; } } +#endif // __AVX__ if (elempack == 4 && num_output_elempack == 4) { @@ -937,7 +1030,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m128 _sum0 = _mm_setzero_ps(); @@ -958,9 +1055,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m128 _val1 = _mm_set1_ps(m[1]); __m128 _val2 = _mm_set1_ps(m[2]); __m128 _val3 = _mm_set1_ps(m[3]); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); - +#else + __m128 _w = _mm_loadu_ps(kptr); +#endif _sum0 = _mm_comp_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm_comp_fmadd_ps(_val1, _w, _sum1); _sum2 = _mm_comp_fmadd_ps(_val2, _w, _sum2); @@ -991,7 +1090,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m128 _sum = _mm_setzero_ps(); @@ -1005,7 +1108,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m128 _val = _mm_set1_ps(m[0]); +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); +#else + __m128 _w = _mm_loadu_ps(kptr); +#endif _sum = _mm_comp_fmadd_ps(_val, _w, _sum); m += 1; @@ -1025,7 +1132,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = (const float*)weight_data_tm + num_input * p; +#endif const float* m = bottom_blob.row(j); __m128 _sum0 = _mm_setzero_ps(); @@ -1045,13 +1156,19 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m128 _val1 = _mm_loadu_ps(m + 4); __m128 _val2 = _mm_loadu_ps(m + 8); __m128 _val3 = _mm_loadu_ps(m + 12); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m128 _w0 = _mm_permute_ps(_w, _MM_SHUFFLE(0, 0, 0, 0)); __m128 _w1 = _mm_permute_ps(_w, _MM_SHUFFLE(1, 1, 1, 1)); __m128 _w2 = 
_mm_permute_ps(_w, _MM_SHUFFLE(2, 2, 2, 2)); __m128 _w3 = _mm_permute_ps(_w, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m128 _w0 = _mm_set1_ps(kptr[0]); + __m128 _w1 = _mm_set1_ps(kptr[1]); + __m128 _w2 = _mm_set1_ps(kptr[2]); + __m128 _w3 = _mm_set1_ps(kptr[3]); +#endif _sum0 = _mm_comp_fmadd_ps(_val0, _w0, _sum0); _sum1 = _mm_comp_fmadd_ps(_val1, _w1, _sum1); @@ -1064,8 +1181,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m128 _val = _mm_loadu_ps(m); - __m128 _k = _mm_set1_ps(float16_to_float32(kptr[0])); - _sum0 = _mm_comp_fmadd_ps(_val, _k, _sum0); +#if NCNN_IMPL_FP16S + __m128 _w = _mm_set1_ps(float16_to_float32(kptr[0])); +#else + __m128 _w = _mm_set1_ps(kptr[0]); +#endif + _sum0 = _mm_comp_fmadd_ps(_val, _w, _sum0); m += 4; kptr += 1; @@ -1081,6 +1202,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c outptr += 4; } } +#endif // __SSE2__ if (elempack == 1 && num_output_elempack == 1) { @@ -1088,7 +1210,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = (const float*)weight_data_tm + num_input * p; +#endif const float* m = bottom_blob.row(j); float sum = 0.f; @@ -1099,33 +1225,54 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c } int i = 0; +#if __SSE2__ +#if __AVX__ __m256 _sum = _mm256_setzero_ps(); for (; i + 7 < num_input; i += 8) { __m256 _m = _mm256_loadu_ps(m); +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); m += 8; kptr += 8; } +#endif // __AVX__ __m128 _suml = _mm_setzero_ps(); for (; i + 3 < num_input; i += 4) { __m128 _val = _mm_loadu_ps(m); +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); +#else + __m128 _w = _mm_loadu_ps(kptr); +#endif _suml = _mm_comp_fmadd_ps(_val, _w, _suml); m += 4; kptr += 4; } +#endif // __SSE2__ for (; i < num_input; i++) { +#if NCNN_IMPL_FP16S sum += *m++ * float16_to_float32(*kptr++); +#else + sum += *m++ * *kptr++; +#endif } - sum += _mm256_reduce_add_ps(_sum); +#if __SSE2__ +#if __AVX__ + _suml = _mm_add_ps(_suml, _mm256_extractf128_ps(_sum, 1)); + _suml = _mm_add_ps(_suml, _mm256_castps256_ps128(_sum)); +#endif // __AVX__ sum += _mm_reduce_add_ps(_suml); +#endif // __SSE2__ sum = activation_ss(sum, activation_type, activation_params); @@ -1134,13 +1281,5 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c } } } -#else // __F16C__ - (void)bottom_blob; - (void)top_blob; - (void)weight_data_fp16; - (void)bias_data; - (void)activation_type; - (void)activation_params; - (void)opt; -#endif // __F16C__ +#endif // NCNN_RUNTIME_CPU } diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp index 30f046d7678e..c9139bc2a66c 100644 --- a/src/layer/x86/innerproduct_x86.cpp +++ b/src/layer/x86/innerproduct_x86.cpp @@ -30,9 +30,14 @@ namespace ncnn { -#if NCNN_F16C -#include "innerproduct_fp16s.h" -#include "innerproduct_gemm_fp16s.h" +#include "innerproduct_fp.h" +#include "innerproduct_gemm_fp.h" + +#if NCNN_F16C && __AVX__ +#define NCNN_IMPL_FP16S 1 +#include "innerproduct_fp.h" +#include "innerproduct_gemm_fp.h" +#undef NCNN_IMPL_FP16S #endif 
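// --- editorial note, not part of the patch ---
// The hunk above is the "include the kernel header twice" pattern: the renamed
// innerproduct_fp.h / innerproduct_gemm_fp.h compile to the plain fp32 kernels
// (e.g. innerproduct_gemm_sse) when NCNN_IMPL_FP16S is undefined, and to the
// fp16-storage kernels (e.g. innerproduct_gemm_fp16s_sse) when it is defined,
// so a single header maintains both code paths. This relies on the header being
// includable more than once. The same mechanism in isolation, with hypothetical
// names:
//
//   // kernel_impl.h -- deliberately includable twice
//   #if IMPL_FP16S
//   static void dot_fp16s(const unsigned short* w, const float* x, int n, float* out);
//   #else
//   static void dot_fp32(const float* w, const float* x, int n, float* out);
//   #endif
//
//   // kernel.cpp
//   #include "kernel_impl.h"   // emits the fp32 variant
//   #define IMPL_FP16S 1
//   #include "kernel_impl.h"   // emits the fp16 variant
//   #undef IMPL_FP16S
// --- end editorial note ---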
InnerProduct_x86::InnerProduct_x86() @@ -64,7 +69,7 @@ int InnerProduct_x86::create_pipeline(const Option& opt) } #endif -#if NCNN_F16C +#if NCNN_F16C && __AVX__ if (cpu_support_x86_f16c() && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); @@ -73,1306 +78,58 @@ int InnerProduct_x86::create_pipeline(const Option& opt) const int num_input = weight_data_size / num_output; - int out_elempack = 1; - -#if __SSE2__ - if (opt.use_packing_layout) - { -#if __AVX512F__ - out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#elif __AVX__ - out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#else - out_elempack = num_output % 4 == 0 ? 4 : 1; -#endif - } -#endif // __SSE2__ - - if (out_elempack != 1) - { - // src = inch-outch - // dst = pb-inch-outch/pb - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack); - - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) - { - float* g0 = weight_data_tm.row(q / out_elempack); - - for (int p = 0; p < num_input; p++) - { - for (int j = 0; j < out_elempack; j++) - { - *g0++ = weight_data_r2.row(q + j)[p]; - } - } - } - } - } - else - { - weight_data_tm = weight_data; - } - - if (opt.lightmode) - { - weight_data.release(); - } - - return 0; -} - -int InnerProduct_x86::destroy_pipeline(const Option& opt) -{ - if (flatten) - { - flatten->destroy_pipeline(opt); - delete flatten; - flatten = 0; - } - - return 0; -} - -int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ -#if NCNN_INT8 - if (opt.use_int8_inference && int8_scale_term) - { - return forward_int8_x86(bottom_blob, top_blob, opt); - } -#endif - -#if NCNN_F16C - if (cpu_support_x86_f16c() && opt.use_fp16_storage) - { - return forward_fp16s(bottom_blob, top_blob, opt); - } -#endif - - const int num_input = weight_data_size / num_output; - - if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) - { - // gemm - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - int num_output_elempack = 1; -#if __SSE2__ - if (opt.use_packing_layout) - { -#if __AVX512F__ - num_output_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#elif __AVX__ - num_output_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#else - num_output_elempack = num_output % 4 == 0 ? 
4 : 1; -#endif - } -#endif // __SSE2__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) - { -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ - if (elempack == 16 && num_output_elempack == 16) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m512 _sum0 = _mm512_set1_ps(0.f); - __m512 _sum1 = _mm512_set1_ps(0.f); - __m512 _sum2 = _mm512_set1_ps(0.f); - __m512 _sum3 = _mm512_set1_ps(0.f); - __m512 _sum4 = _mm512_set1_ps(0.f); - __m512 _sum5 = _mm512_set1_ps(0.f); - __m512 _sum6 = _mm512_set1_ps(0.f); - __m512 _sum7 = _mm512_set1_ps(0.f); - __m512 _sum8 = _mm512_set1_ps(0.f); - __m512 _sum9 = _mm512_set1_ps(0.f); - __m512 _suma = _mm512_set1_ps(0.f); - __m512 _sumb = _mm512_set1_ps(0.f); - __m512 _sumc = _mm512_set1_ps(0.f); - __m512 _sumd = _mm512_set1_ps(0.f); - __m512 _sume = _mm512_set1_ps(0.f); - __m512 _sumf = _mm512_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm512_set1_ps(bias_data[p * 16 + 0]); - _sum1 = _mm512_set1_ps(bias_data[p * 16 + 1]); - _sum2 = _mm512_set1_ps(bias_data[p * 16 + 2]); - _sum3 = _mm512_set1_ps(bias_data[p * 16 + 3]); - _sum4 = _mm512_set1_ps(bias_data[p * 16 + 4]); - _sum5 = _mm512_set1_ps(bias_data[p * 16 + 5]); - _sum6 = _mm512_set1_ps(bias_data[p * 16 + 6]); - _sum7 = _mm512_set1_ps(bias_data[p * 16 + 7]); - _sum8 = _mm512_set1_ps(bias_data[p * 16 + 8]); - _sum9 = _mm512_set1_ps(bias_data[p * 16 + 9]); - _suma = _mm512_set1_ps(bias_data[p * 16 + 10]); - _sumb = _mm512_set1_ps(bias_data[p * 16 + 11]); - _sumc = _mm512_set1_ps(bias_data[p * 16 + 12]); - _sumd = _mm512_set1_ps(bias_data[p * 16 + 13]); - _sume = _mm512_set1_ps(bias_data[p * 16 + 14]); - _sumf = _mm512_set1_ps(bias_data[p * 16 + 15]); - } - - for (int i = 0; i < num_input; i++) - { - __m512 _val = _mm512_loadu_ps(m); - _sum0 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[0]), _sum0); - _sum1 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[1]), _sum1); - _sum2 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[2]), _sum2); - _sum3 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[3]), _sum3); - _sum4 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[4]), _sum4); - _sum5 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[5]), _sum5); - _sum6 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[6]), _sum6); - _sum7 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[7]), _sum7); - _sum8 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[8]), _sum8); - _sum9 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[9]), _sum9); - _suma = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[10]), _suma); - _sumb = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[11]), _sumb); - _sumc = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[12]), _sumc); - _sumd = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[13]), _sumd); - _sume = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[14]), _sume); - _sumf = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[15]), _sumf); - - m += 16; - kptr += 16; - } - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - _sum1 = activation_avx512(_sum1, activation_type, activation_params); - _sum2 = activation_avx512(_sum2, activation_type, activation_params); - _sum3 = activation_avx512(_sum3, activation_type, activation_params); - _sum4 = activation_avx512(_sum4, activation_type, activation_params); - _sum5 = activation_avx512(_sum5, activation_type, activation_params); - _sum6 = activation_avx512(_sum6, activation_type, activation_params); - _sum7 = activation_avx512(_sum7, 
activation_type, activation_params); - _sum8 = activation_avx512(_sum8, activation_type, activation_params); - _sum9 = activation_avx512(_sum9, activation_type, activation_params); - _suma = activation_avx512(_suma, activation_type, activation_params); - _sumb = activation_avx512(_sumb, activation_type, activation_params); - _sumc = activation_avx512(_sumc, activation_type, activation_params); - _sumd = activation_avx512(_sumd, activation_type, activation_params); - _sume = activation_avx512(_sume, activation_type, activation_params); - _sumf = activation_avx512(_sumf, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum0); - _mm512_storeu_ps(outptr + 16, _sum1); - _mm512_storeu_ps(outptr + 16 * 2, _sum2); - _mm512_storeu_ps(outptr + 16 * 3, _sum3); - _mm512_storeu_ps(outptr + 16 * 4, _sum4); - _mm512_storeu_ps(outptr + 16 * 5, _sum5); - _mm512_storeu_ps(outptr + 16 * 6, _sum6); - _mm512_storeu_ps(outptr + 16 * 7, _sum7); - _mm512_storeu_ps(outptr + 16 * 8, _sum8); - _mm512_storeu_ps(outptr + 16 * 9, _sum9); - _mm512_storeu_ps(outptr + 16 * 10, _suma); - _mm512_storeu_ps(outptr + 16 * 11, _sumb); - _mm512_storeu_ps(outptr + 16 * 12, _sumc); - _mm512_storeu_ps(outptr + 16 * 13, _sumd); - _mm512_storeu_ps(outptr + 16 * 14, _sume); - _mm512_storeu_ps(outptr + 16 * 15, _sumf); - outptr += 256; - } - } - - if (elempack == 1 && num_output_elempack == 16) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m512 _sum = _mm512_set1_ps(0.f); - - if (bias_term) - { - _sum = _mm512_loadu_ps((const float*)bias_data + p * 16); - } - - int i = 0; - for (; i < num_input; i++) - { - __m512 _val = _mm512_set1_ps(m[0]); - __m512 _w = _mm512_loadu_ps(kptr); - _sum = _mm512_fmadd_ps(_val, _w, _sum); - - m += 1; - kptr += 16; - } - - _sum = activation_avx512(_sum, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum); - outptr += 16; - } - } - - if (elempack == 4 && num_output_elempack == 16) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = _mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); - __m128 _sum4 = _mm_set1_ps(0.f); - __m128 _sum5 = _mm_set1_ps(0.f); - __m128 _sum6 = _mm_set1_ps(0.f); - __m128 _sum7 = _mm_set1_ps(0.f); - __m128 _sum8 = _mm_set1_ps(0.f); - __m128 _sum9 = _mm_set1_ps(0.f); - __m128 _suma = _mm_set1_ps(0.f); - __m128 _sumb = _mm_set1_ps(0.f); - __m128 _sumc = _mm_set1_ps(0.f); - __m128 _sumd = _mm_set1_ps(0.f); - __m128 _sume = _mm_set1_ps(0.f); - __m128 _sumf = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm_set1_ps(bias_data[p * 16 + 0]); - _sum1 = _mm_set1_ps(bias_data[p * 16 + 1]); - _sum2 = _mm_set1_ps(bias_data[p * 16 + 2]); - _sum3 = _mm_set1_ps(bias_data[p * 16 + 3]); - _sum4 = _mm_set1_ps(bias_data[p * 16 + 4]); - _sum5 = _mm_set1_ps(bias_data[p * 16 + 5]); - _sum6 = _mm_set1_ps(bias_data[p * 16 + 6]); - _sum7 = _mm_set1_ps(bias_data[p * 16 + 7]); - _sum8 = _mm_set1_ps(bias_data[p * 16 + 8]); - _sum9 = _mm_set1_ps(bias_data[p * 16 + 9]); - _suma = _mm_set1_ps(bias_data[p * 16 + 10]); - _sumb = _mm_set1_ps(bias_data[p * 16 + 11]); - _sumc = _mm_set1_ps(bias_data[p * 16 + 12]); - _sumd = _mm_set1_ps(bias_data[p * 16 + 13]); - _sume = _mm_set1_ps(bias_data[p * 16 + 14]); 
- _sumf = _mm_set1_ps(bias_data[p * 16 + 15]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m128 _val = _mm_loadu_ps(m); - _sum0 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[0]), _sum0); - _sum1 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[1]), _sum1); - _sum2 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[2]), _sum2); - _sum3 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[3]), _sum3); - _sum4 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[4]), _sum4); - _sum5 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[5]), _sum5); - _sum6 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[6]), _sum6); - _sum7 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[7]), _sum7); - _sum8 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[8]), _sum8); - _sum9 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[9]), _sum9); - _suma = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[10]), _suma); - _sumb = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[11]), _sumb); - _sumc = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[12]), _sumc); - _sumd = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[13]), _sumd); - _sume = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[14]), _sume); - _sumf = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[15]), _sumf); - - m += 4; - kptr += 16; - } - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - _sum1 = activation_sse(_sum1, activation_type, activation_params); - _sum2 = activation_sse(_sum2, activation_type, activation_params); - _sum3 = activation_sse(_sum3, activation_type, activation_params); - _sum4 = activation_sse(_sum4, activation_type, activation_params); - _sum5 = activation_sse(_sum5, activation_type, activation_params); - _sum6 = activation_sse(_sum6, activation_type, activation_params); - _sum7 = activation_sse(_sum7, activation_type, activation_params); - _sum8 = activation_sse(_sum8, activation_type, activation_params); - _sum9 = activation_sse(_sum9, activation_type, activation_params); - _suma = activation_sse(_suma, activation_type, activation_params); - _sumb = activation_sse(_sumb, activation_type, activation_params); - _sumc = activation_sse(_sumc, activation_type, activation_params); - _sumd = activation_sse(_sumd, activation_type, activation_params); - _sume = activation_sse(_sume, activation_type, activation_params); - _sumf = activation_sse(_sumf, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum0); - _mm_storeu_ps(outptr + 4, _sum1); - _mm_storeu_ps(outptr + 4 * 2, _sum2); - _mm_storeu_ps(outptr + 4 * 3, _sum3); - _mm_storeu_ps(outptr + 4 * 4, _sum4); - _mm_storeu_ps(outptr + 4 * 5, _sum5); - _mm_storeu_ps(outptr + 4 * 6, _sum6); - _mm_storeu_ps(outptr + 4 * 7, _sum7); - _mm_storeu_ps(outptr + 4 * 8, _sum8); - _mm_storeu_ps(outptr + 4 * 9, _sum9); - _mm_storeu_ps(outptr + 4 * 10, _suma); - _mm_storeu_ps(outptr + 4 * 11, _sumb); - _mm_storeu_ps(outptr + 4 * 12, _sumc); - _mm_storeu_ps(outptr + 4 * 13, _sumd); - _mm_storeu_ps(outptr + 4 * 14, _sume); - _mm_storeu_ps(outptr + 4 * 15, _sumf); - outptr += 64; - } - } - - if (elempack == 8 && num_output_elempack == 16) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - __m256 _sum8 = _mm256_set1_ps(0.f); - __m256 _sum9 = _mm256_set1_ps(0.f); - __m256 _suma = _mm256_set1_ps(0.f); - __m256 _sumb 
= _mm256_set1_ps(0.f); - __m256 _sumc = _mm256_set1_ps(0.f); - __m256 _sumd = _mm256_set1_ps(0.f); - __m256 _sume = _mm256_set1_ps(0.f); - __m256 _sumf = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_set1_ps(bias_data[p * 16 + 0]); - _sum1 = _mm256_set1_ps(bias_data[p * 16 + 1]); - _sum2 = _mm256_set1_ps(bias_data[p * 16 + 2]); - _sum3 = _mm256_set1_ps(bias_data[p * 16 + 3]); - _sum4 = _mm256_set1_ps(bias_data[p * 16 + 4]); - _sum5 = _mm256_set1_ps(bias_data[p * 16 + 5]); - _sum6 = _mm256_set1_ps(bias_data[p * 16 + 6]); - _sum7 = _mm256_set1_ps(bias_data[p * 16 + 7]); - _sum8 = _mm256_set1_ps(bias_data[p * 16 + 8]); - _sum9 = _mm256_set1_ps(bias_data[p * 16 + 9]); - _suma = _mm256_set1_ps(bias_data[p * 16 + 10]); - _sumb = _mm256_set1_ps(bias_data[p * 16 + 11]); - _sumc = _mm256_set1_ps(bias_data[p * 16 + 12]); - _sumd = _mm256_set1_ps(bias_data[p * 16 + 13]); - _sume = _mm256_set1_ps(bias_data[p * 16 + 14]); - _sumf = _mm256_set1_ps(bias_data[p * 16 + 15]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m256 _val = _mm256_loadu_ps(m); - _sum0 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[3]), _sum3); - _sum4 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[4]), _sum4); - _sum5 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[5]), _sum5); - _sum6 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[6]), _sum6); - _sum7 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[7]), _sum7); - _sum8 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[8]), _sum8); - _sum9 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[9]), _sum9); - _suma = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[10]), _suma); - _sumb = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[11]), _sumb); - _sumc = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[12]), _sumc); - _sumd = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[13]), _sumd); - _sume = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[14]), _sume); - _sumf = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[15]), _sumf); - - m += 8; - kptr += 16; - } - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - _sum1 = activation_avx(_sum1, activation_type, activation_params); - _sum2 = activation_avx(_sum2, activation_type, activation_params); - _sum3 = activation_avx(_sum3, activation_type, activation_params); - _sum4 = activation_avx(_sum4, activation_type, activation_params); - _sum5 = activation_avx(_sum5, activation_type, activation_params); - _sum6 = activation_avx(_sum6, activation_type, activation_params); - _sum7 = activation_avx(_sum7, activation_type, activation_params); - _sum8 = activation_avx(_sum8, activation_type, activation_params); - _sum9 = activation_avx(_sum9, activation_type, activation_params); - _suma = activation_avx(_suma, activation_type, activation_params); - _sumb = activation_avx(_sumb, activation_type, activation_params); - _sumc = activation_avx(_sumc, activation_type, activation_params); - _sumd = activation_avx(_sumd, activation_type, activation_params); - _sume = activation_avx(_sume, activation_type, activation_params); - _sumf = activation_avx(_sumf, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum0); - _mm256_storeu_ps(outptr + 8, _sum1); - _mm256_storeu_ps(outptr + 8 * 2, _sum2); - _mm256_storeu_ps(outptr + 8 * 3, _sum3); - _mm256_storeu_ps(outptr + 8 * 4, _sum4); - _mm256_storeu_ps(outptr + 8 * 5, _sum5); - _mm256_storeu_ps(outptr + 8 * 6, _sum6); - 
_mm256_storeu_ps(outptr + 8 * 7, _sum7); - _mm256_storeu_ps(outptr + 8 * 8, _sum8); - _mm256_storeu_ps(outptr + 8 * 9, _sum9); - _mm256_storeu_ps(outptr + 8 * 10, _suma); - _mm256_storeu_ps(outptr + 8 * 11, _sumb); - _mm256_storeu_ps(outptr + 8 * 12, _sumc); - _mm256_storeu_ps(outptr + 8 * 13, _sumd); - _mm256_storeu_ps(outptr + 8 * 14, _sume); - _mm256_storeu_ps(outptr + 8 * 15, _sumf); - outptr += 128; - } - } - - if (elempack == 16 && num_output_elempack == 1) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output; p++) - { - const float* kptr = (const float*)weight_data_tm + num_input * p; - const float* m = bottom_blob.row(j); - - __m512 _sum0 = _mm512_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm512_set1_ps(bias_data[p]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m512 _val = _mm512_loadu_ps(m); - __m512 _k = _mm512_set1_ps(kptr[0]); - _sum0 = _mm512_fmadd_ps(_val, _k, _sum0); - - m += 16; - kptr += 1; - } - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum0); - outptr += 16; - } - } - - if (elempack == 16 && num_output_elempack == 4) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m512 _sum0 = _mm512_set1_ps(0.f); - __m512 _sum1 = _mm512_set1_ps(0.f); - __m512 _sum2 = _mm512_set1_ps(0.f); - __m512 _sum3 = _mm512_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm512_set1_ps(bias_data[p * 4 + 0]); - _sum1 = _mm512_set1_ps(bias_data[p * 4 + 1]); - _sum2 = _mm512_set1_ps(bias_data[p * 4 + 2]); - _sum3 = _mm512_set1_ps(bias_data[p * 4 + 3]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m512 _val = _mm512_loadu_ps(m); - _sum0 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[0]), _sum0); - _sum1 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[1]), _sum1); - _sum2 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[2]), _sum2); - _sum3 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[3]), _sum3); - - m += 16; - kptr += 4; - } - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - _sum1 = activation_avx512(_sum1, activation_type, activation_params); - _sum2 = activation_avx512(_sum2, activation_type, activation_params); - _sum3 = activation_avx512(_sum3, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum0); - _mm512_storeu_ps(outptr + 16, _sum1); - _mm512_storeu_ps(outptr + 32, _sum2); - _mm512_storeu_ps(outptr + 48, _sum3); - outptr += 64; - } - } - - if (elempack == 16 && num_output_elempack == 8) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m512 _sum0 = _mm512_set1_ps(0.f); - __m512 _sum1 = _mm512_set1_ps(0.f); - __m512 _sum2 = _mm512_set1_ps(0.f); - __m512 _sum3 = _mm512_set1_ps(0.f); - __m512 _sum4 = _mm512_set1_ps(0.f); - __m512 _sum5 = _mm512_set1_ps(0.f); - __m512 _sum6 = _mm512_set1_ps(0.f); - __m512 _sum7 = _mm512_set1_ps(0.f); + innerproduct_transform_kernel_sse(weight_data, weight_data_tm, num_input, num_output, opt); - if (bias_term) - { - _sum0 = _mm512_set1_ps(bias_data[p * 8 + 0]); - _sum1 = _mm512_set1_ps(bias_data[p * 8 + 1]); - _sum2 = _mm512_set1_ps(bias_data[p * 8 + 2]); - _sum3 = _mm512_set1_ps(bias_data[p * 8 + 3]); - _sum4 = _mm512_set1_ps(bias_data[p * 8 + 4]); - _sum5 = _mm512_set1_ps(bias_data[p * 8 + 5]); - _sum6 = _mm512_set1_ps(bias_data[p * 8 + 
6]); - _sum7 = _mm512_set1_ps(bias_data[p * 8 + 7]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m512 _val = _mm512_loadu_ps(m); - _sum0 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[0]), _sum0); - _sum1 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[1]), _sum1); - _sum2 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[2]), _sum2); - _sum3 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[3]), _sum3); - _sum4 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[4]), _sum4); - _sum5 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[5]), _sum5); - _sum6 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[6]), _sum6); - _sum7 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[7]), _sum7); - - m += 16; - kptr += 8; - } - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - _sum1 = activation_avx512(_sum1, activation_type, activation_params); - _sum2 = activation_avx512(_sum2, activation_type, activation_params); - _sum3 = activation_avx512(_sum3, activation_type, activation_params); - _sum4 = activation_avx512(_sum4, activation_type, activation_params); - _sum5 = activation_avx512(_sum5, activation_type, activation_params); - _sum6 = activation_avx512(_sum6, activation_type, activation_params); - _sum7 = activation_avx512(_sum7, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum0); - _mm512_storeu_ps(outptr + 16, _sum1); - _mm512_storeu_ps(outptr + 16 * 2, _sum2); - _mm512_storeu_ps(outptr + 16 * 3, _sum3); - _mm512_storeu_ps(outptr + 16 * 4, _sum4); - _mm512_storeu_ps(outptr + 16 * 5, _sum5); - _mm512_storeu_ps(outptr + 16 * 6, _sum6); - _mm512_storeu_ps(outptr + 16 * 7, _sum7); - outptr += 128; - } - } - -#endif // __AVX512F__ - - if (elempack == 8 && num_output_elempack == 8) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_set1_ps(bias_data[p * 8 + 0]); - _sum1 = _mm256_set1_ps(bias_data[p * 8 + 1]); - _sum2 = _mm256_set1_ps(bias_data[p * 8 + 2]); - _sum3 = _mm256_set1_ps(bias_data[p * 8 + 3]); - _sum4 = _mm256_set1_ps(bias_data[p * 8 + 4]); - _sum5 = _mm256_set1_ps(bias_data[p * 8 + 5]); - _sum6 = _mm256_set1_ps(bias_data[p * 8 + 6]); - _sum7 = _mm256_set1_ps(bias_data[p * 8 + 7]); - } - - for (int i = 0; i < num_input; i++) - { - __m256 _val = _mm256_loadu_ps(m); - __m256 _k0 = _mm256_set1_ps(kptr[0]); - __m256 _k1 = _mm256_set1_ps(kptr[1]); - __m256 _k2 = _mm256_set1_ps(kptr[2]); - __m256 _k3 = _mm256_set1_ps(kptr[3]); - __m256 _k4 = _mm256_set1_ps(kptr[4]); - __m256 _k5 = _mm256_set1_ps(kptr[5]); - __m256 _k6 = _mm256_set1_ps(kptr[6]); - __m256 _k7 = _mm256_set1_ps(kptr[7]); - _sum0 = _mm256_comp_fmadd_ps(_val, _k0, _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val, _k1, _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val, _k2, _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val, _k3, _sum3); - _sum4 = _mm256_comp_fmadd_ps(_val, _k4, _sum4); - _sum5 = _mm256_comp_fmadd_ps(_val, _k5, _sum5); - _sum6 = _mm256_comp_fmadd_ps(_val, _k6, _sum6); - _sum7 = _mm256_comp_fmadd_ps(_val, _k7, _sum7); - - m += 8; - kptr += 8; - } - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - _sum1 = activation_avx(_sum1, 
activation_type, activation_params); - _sum2 = activation_avx(_sum2, activation_type, activation_params); - _sum3 = activation_avx(_sum3, activation_type, activation_params); - _sum4 = activation_avx(_sum4, activation_type, activation_params); - _sum5 = activation_avx(_sum5, activation_type, activation_params); - _sum6 = activation_avx(_sum6, activation_type, activation_params); - _sum7 = activation_avx(_sum7, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum0); - _mm256_storeu_ps(outptr + 8, _sum1); - _mm256_storeu_ps(outptr + 16, _sum2); - _mm256_storeu_ps(outptr + 24, _sum3); - _mm256_storeu_ps(outptr + 32, _sum4); - _mm256_storeu_ps(outptr + 40, _sum5); - _mm256_storeu_ps(outptr + 48, _sum6); - _mm256_storeu_ps(outptr + 56, _sum7); - outptr += 64; - } - } - - if (elempack == 1 && num_output_elempack == 8) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m256 _sum = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum = _mm256_loadu_ps((const float*)bias_data + p * 8); - } - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _val0 = _mm256_broadcast_ss(m); - __m256 _val1 = _mm256_broadcast_ss(m + 1); - __m256 _val2 = _mm256_broadcast_ss(m + 2); - __m256 _val3 = _mm256_broadcast_ss(m + 3); - __m256 _val4 = _mm256_broadcast_ss(m + 4); - __m256 _val5 = _mm256_broadcast_ss(m + 5); - __m256 _val6 = _mm256_broadcast_ss(m + 6); - __m256 _val7 = _mm256_broadcast_ss(m + 7); - - __m256 _w0 = _mm256_loadu_ps(kptr); - _sum = _mm256_comp_fmadd_ps(_val0, _w0, _sum); - __m256 _w1 = _mm256_loadu_ps(kptr + 8); - _sum = _mm256_comp_fmadd_ps(_val1, _w1, _sum); - __m256 _w2 = _mm256_loadu_ps(kptr + 16); - _sum = _mm256_comp_fmadd_ps(_val2, _w2, _sum); - __m256 _w3 = _mm256_loadu_ps(kptr + 24); - _sum = _mm256_comp_fmadd_ps(_val3, _w3, _sum); - __m256 _w4 = _mm256_loadu_ps(kptr + 32); - _sum = _mm256_comp_fmadd_ps(_val4, _w4, _sum); - __m256 _w5 = _mm256_loadu_ps(kptr + 40); - _sum = _mm256_comp_fmadd_ps(_val5, _w5, _sum); - __m256 _w6 = _mm256_loadu_ps(kptr + 48); - _sum = _mm256_comp_fmadd_ps(_val6, _w6, _sum); - __m256 _w7 = _mm256_loadu_ps(kptr + 56); - _sum = _mm256_comp_fmadd_ps(_val7, _w7, _sum); - - m += 8; - kptr += 64; - } - for (; i + 3 < num_input; i += 4) - { - __m256 _val0 = _mm256_broadcast_ss(m); - __m256 _val1 = _mm256_broadcast_ss(m + 1); - __m256 _val2 = _mm256_broadcast_ss(m + 2); - __m256 _val3 = _mm256_broadcast_ss(m + 3); - - __m256 _w0 = _mm256_loadu_ps(kptr); - _sum = _mm256_comp_fmadd_ps(_val0, _w0, _sum); - __m256 _w1 = _mm256_loadu_ps(kptr + 8); - _sum = _mm256_comp_fmadd_ps(_val1, _w1, _sum); - __m256 _w2 = _mm256_loadu_ps(kptr + 16); - _sum = _mm256_comp_fmadd_ps(_val2, _w2, _sum); - __m256 _w3 = _mm256_loadu_ps(kptr + 24); - _sum = _mm256_comp_fmadd_ps(_val3, _w3, _sum); - - m += 4; - kptr += 32; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_set1_ps(m[0]); - __m256 _w = _mm256_loadu_ps(kptr); - _sum = _mm256_comp_fmadd_ps(_val, _w, _sum); - - m += 1; - kptr += 8; - } - - _sum = activation_avx(_sum, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum); - outptr += 8; - } - } - - if (elempack == 4 && num_output_elempack == 8) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = 
_mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); - __m128 _sum4 = _mm_set1_ps(0.f); - __m128 _sum5 = _mm_set1_ps(0.f); - __m128 _sum6 = _mm_set1_ps(0.f); - __m128 _sum7 = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm_set1_ps(bias_data[p * 8 + 0]); - _sum1 = _mm_set1_ps(bias_data[p * 8 + 1]); - _sum2 = _mm_set1_ps(bias_data[p * 8 + 2]); - _sum3 = _mm_set1_ps(bias_data[p * 8 + 3]); - _sum4 = _mm_set1_ps(bias_data[p * 8 + 4]); - _sum5 = _mm_set1_ps(bias_data[p * 8 + 5]); - _sum6 = _mm_set1_ps(bias_data[p * 8 + 6]); - _sum7 = _mm_set1_ps(bias_data[p * 8 + 7]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m128 _val = _mm_loadu_ps(m); - _sum0 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[0]), _sum0); - _sum1 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[1]), _sum1); - _sum2 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[2]), _sum2); - _sum3 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[3]), _sum3); - _sum4 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[4]), _sum4); - _sum5 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[5]), _sum5); - _sum6 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[6]), _sum6); - _sum7 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[7]), _sum7); - - m += 4; - kptr += 8; - } - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - _sum1 = activation_sse(_sum1, activation_type, activation_params); - _sum2 = activation_sse(_sum2, activation_type, activation_params); - _sum3 = activation_sse(_sum3, activation_type, activation_params); - _sum4 = activation_sse(_sum4, activation_type, activation_params); - _sum5 = activation_sse(_sum5, activation_type, activation_params); - _sum6 = activation_sse(_sum6, activation_type, activation_params); - _sum7 = activation_sse(_sum7, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum0); - _mm_storeu_ps(outptr + 4, _sum1); - _mm_storeu_ps(outptr + 8, _sum2); - _mm_storeu_ps(outptr + 12, _sum3); - _mm_storeu_ps(outptr + 16, _sum4); - _mm_storeu_ps(outptr + 20, _sum5); - _mm_storeu_ps(outptr + 24, _sum6); - _mm_storeu_ps(outptr + 28, _sum7); - outptr += 32; - } - } - - if (elempack == 8 && num_output_elempack == 1) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output; p++) - { - const float* kptr = (const float*)weight_data_tm + num_input * p; - const float* m = bottom_blob.row(j); - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_set1_ps(bias_data[p]); - } - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _val0 = _mm256_loadu_ps(m); - __m256 _val1 = _mm256_loadu_ps(m + 8); - __m256 _val2 = _mm256_loadu_ps(m + 16); - __m256 _val3 = _mm256_loadu_ps(m + 24); - __m256 _val4 = _mm256_loadu_ps(m + 32); - __m256 _val5 = _mm256_loadu_ps(m + 40); - __m256 _val6 = _mm256_loadu_ps(m + 48); - __m256 _val7 = _mm256_loadu_ps(m + 56); - _sum0 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[3]), _sum3); - _sum0 = _mm256_comp_fmadd_ps(_val4, _mm256_set1_ps(kptr[4]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val5, _mm256_set1_ps(kptr[5]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val6, _mm256_set1_ps(kptr[6]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val7, _mm256_set1_ps(kptr[7]), _sum3); - - m += 64; - kptr += 8; - } - 
for (; i + 3 < num_input; i += 4) - { - __m256 _val0 = _mm256_loadu_ps(m); - __m256 _val1 = _mm256_loadu_ps(m + 8); - __m256 _val2 = _mm256_loadu_ps(m + 16); - __m256 _val3 = _mm256_loadu_ps(m + 24); - _sum0 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[3]), _sum3); - - m += 32; - kptr += 4; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_loadu_ps(m); - __m256 _k = _mm256_set1_ps(kptr[0]); - _sum0 = _mm256_comp_fmadd_ps(_val, _k, _sum0); - - m += 8; - kptr += 1; - } - - _sum0 = _mm256_add_ps(_sum0, _sum1); - _sum2 = _mm256_add_ps(_sum2, _sum3); - _sum0 = _mm256_add_ps(_sum0, _sum2); - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum0); - outptr += 8; - } - } - - if (elempack == 8 && num_output_elempack == 4) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_set1_ps(bias_data[p * 4 + 0]); - _sum1 = _mm256_set1_ps(bias_data[p * 4 + 1]); - _sum2 = _mm256_set1_ps(bias_data[p * 4 + 2]); - _sum3 = _mm256_set1_ps(bias_data[p * 4 + 3]); - } - - int i = 0; - for (; i + 3 < num_input; i += 4) - { - __m256 _val0 = _mm256_loadu_ps(m); - __m256 _val1 = _mm256_loadu_ps(m + 8); - __m256 _val2 = _mm256_loadu_ps(m + 16); - __m256 _val3 = _mm256_loadu_ps(m + 24); - _sum0 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[3]), _sum3); - _sum0 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[4]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[5]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[6]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[7]), _sum3); - kptr += 8; - - _sum0 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[3]), _sum3); - _sum0 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[4]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[5]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[6]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[7]), _sum3); - - m += 32; - kptr += 8; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_loadu_ps(m); - _sum0 = _mm256_comp_fmadd_ps(_val, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val, _mm256_set1_ps(kptr[3]), _sum3); - - m += 8; - kptr += 4; - } - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - _sum1 = activation_avx(_sum1, activation_type, activation_params); - _sum2 = activation_avx(_sum2, activation_type, activation_params); - 
_sum3 = activation_avx(_sum3, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum0); - _mm256_storeu_ps(outptr + 8, _sum1); - _mm256_storeu_ps(outptr + 16, _sum2); - _mm256_storeu_ps(outptr + 24, _sum3); - outptr += 32; - } - } -#endif // __AVX__ - - if (elempack == 4 && num_output_elempack == 4) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = _mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm_set1_ps(bias_data[p * 4 + 0]); - _sum1 = _mm_set1_ps(bias_data[p * 4 + 1]); - _sum2 = _mm_set1_ps(bias_data[p * 4 + 2]); - _sum3 = _mm_set1_ps(bias_data[p * 4 + 3]); - } - - int i = 0; - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_loadu_ps(m); - __m128 _val1 = _mm_loadu_ps(m + 4); - __m128 _val2 = _mm_loadu_ps(m + 8); - __m128 _val3 = _mm_loadu_ps(m + 12); - _sum0 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[0])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[1])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[2])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[3])), _sum3); - _sum0 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[4])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[5])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[6])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[7])), _sum3); - _sum0 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[8])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[9])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[10])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[11])), _sum3); - _sum0 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[12])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[13])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[14])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[15])), _sum3); - - m += 16; - kptr += 16; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_loadu_ps(m); - _sum0 = _mm_add_ps(_mm_mul_ps(_val, _mm_set1_ps(kptr[0])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val, _mm_set1_ps(kptr[1])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val, _mm_set1_ps(kptr[2])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val, _mm_set1_ps(kptr[3])), _sum3); - - m += 4; - kptr += 4; - } - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - _sum1 = activation_sse(_sum1, activation_type, activation_params); - _sum2 = activation_sse(_sum2, activation_type, activation_params); - _sum3 = activation_sse(_sum3, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum0); - _mm_storeu_ps(outptr + 4, _sum1); - _mm_storeu_ps(outptr + 8, _sum2); - _mm_storeu_ps(outptr + 12, _sum3); - outptr += 16; - } - } - - if (elempack == 1 && num_output_elempack == 4) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m128 _sum = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum = _mm_loadu_ps((const float*)bias_data + p * 4); - } - - int i = 0; -#if __AVX__ - for (; i + 7 < num_input; i += 8) - { - __m128 _val0 = _mm_broadcast_ss(m); - __m128 _val1 = _mm_broadcast_ss(m + 1); - __m128 
_val2 = _mm_broadcast_ss(m + 2); - __m128 _val3 = _mm_broadcast_ss(m + 3); - __m128 _val4 = _mm_broadcast_ss(m + 4); - __m128 _val5 = _mm_broadcast_ss(m + 5); - __m128 _val6 = _mm_broadcast_ss(m + 6); - __m128 _val7 = _mm_broadcast_ss(m + 7); - - __m128 _w0 = _mm_loadu_ps(kptr); - _sum = _mm_comp_fmadd_ps(_val0, _w0, _sum); - __m128 _w1 = _mm_loadu_ps(kptr + 4); - _sum = _mm_comp_fmadd_ps(_val1, _w1, _sum); - __m128 _w2 = _mm_loadu_ps(kptr + 8); - _sum = _mm_comp_fmadd_ps(_val2, _w2, _sum); - __m128 _w3 = _mm_loadu_ps(kptr + 12); - _sum = _mm_comp_fmadd_ps(_val3, _w3, _sum); - __m128 _w4 = _mm_loadu_ps(kptr + 16); - _sum = _mm_comp_fmadd_ps(_val4, _w4, _sum); - __m128 _w5 = _mm_loadu_ps(kptr + 20); - _sum = _mm_comp_fmadd_ps(_val5, _w5, _sum); - __m128 _w6 = _mm_loadu_ps(kptr + 24); - _sum = _mm_comp_fmadd_ps(_val6, _w6, _sum); - __m128 _w7 = _mm_loadu_ps(kptr + 28); - _sum = _mm_comp_fmadd_ps(_val7, _w7, _sum); - - m += 8; - kptr += 32; - } -#endif // __AVX__ - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_set1_ps(m[0]); - __m128 _val1 = _mm_set1_ps(m[1]); - __m128 _val2 = _mm_set1_ps(m[2]); - __m128 _val3 = _mm_set1_ps(m[3]); - - __m128 _w0 = _mm_loadu_ps(kptr); - _sum = _mm_add_ps(_mm_mul_ps(_val0, _w0), _sum); - __m128 _w1 = _mm_loadu_ps(kptr + 4); - _sum = _mm_add_ps(_mm_mul_ps(_val1, _w1), _sum); - __m128 _w2 = _mm_loadu_ps(kptr + 8); - _sum = _mm_add_ps(_mm_mul_ps(_val2, _w2), _sum); - __m128 _w3 = _mm_loadu_ps(kptr + 12); - _sum = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum); - - m += 4; - kptr += 16; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_set1_ps(m[0]); - __m128 _k = _mm_loadu_ps(kptr); - _sum = _mm_add_ps(_mm_mul_ps(_val, _k), _sum); - - m += 1; - kptr += 4; - } - - _sum = activation_sse(_sum, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum); - outptr += 4; - } - } - - if (elempack == 4 && num_output_elempack == 1) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output; p++) - { - const float* kptr = (const float*)weight_data_tm + num_input * p; - const float* m = bottom_blob.row(j); - - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = _mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm_set1_ps(bias_data[p]); - } - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m128 _val0 = _mm_loadu_ps(m); - __m128 _val1 = _mm_loadu_ps(m + 4); - __m128 _val2 = _mm_loadu_ps(m + 8); - __m128 _val3 = _mm_loadu_ps(m + 12); - __m128 _val4 = _mm_loadu_ps(m + 16); - __m128 _val5 = _mm_loadu_ps(m + 20); - __m128 _val6 = _mm_loadu_ps(m + 24); - __m128 _val7 = _mm_loadu_ps(m + 28); - _sum0 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[0])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[1])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[2])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[3])), _sum3); - _sum0 = _mm_add_ps(_mm_mul_ps(_val4, _mm_set1_ps(kptr[4])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val5, _mm_set1_ps(kptr[5])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val6, _mm_set1_ps(kptr[6])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val7, _mm_set1_ps(kptr[7])), _sum3); - - m += 32; - kptr += 8; - } - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_loadu_ps(m); - __m128 _val1 = _mm_loadu_ps(m + 4); - __m128 _val2 = _mm_loadu_ps(m + 8); - __m128 _val3 = _mm_loadu_ps(m + 12); - _sum0 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[0])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val1, 
_mm_set1_ps(kptr[1])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[2])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[3])), _sum3); - - m += 16; - kptr += 4; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_loadu_ps(m); - __m128 _k = _mm_set1_ps(kptr[0]); - _sum0 = _mm_add_ps(_mm_mul_ps(_val, _k), _sum0); - - m += 4; - kptr += 1; - } - - _sum0 = _mm_add_ps(_sum0, _sum1); - _sum2 = _mm_add_ps(_sum2, _sum3); - _sum0 = _mm_add_ps(_sum0, _sum2); - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum0); - outptr += 4; - } - } -#endif // __SSE2__ - - if (elempack == 1 && num_output_elempack == 1) - { - float* outptr = top_blob.row(j); + if (opt.lightmode) + { + weight_data.release(); + } - for (int p = 0; p < num_output; p++) - { - const float* kptr = (const float*)weight_data_tm + num_input * p; - const float* m = bottom_blob.row(j); + return 0; +} - float sum = 0.f; +int InnerProduct_x86::destroy_pipeline(const Option& opt) +{ + if (flatten) + { + flatten->destroy_pipeline(opt); + delete flatten; + flatten = 0; + } - if (bias_term) - { - sum = bias_data[p]; - } + return 0; +} - int i = 0; -#if __SSE2__ -#if __AVX__ - __m256 _sum = _mm256_set1_ps(0.f); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - __m256 _w = _mm256_loadu_ps(kptr); - _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); +int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_x86(bottom_blob, top_blob, opt); + } +#endif - m += 8; - kptr += 8; - } -#endif // __AVX__ - __m128 _suml = _mm_set1_ps(0.f); - for (; i + 3 < num_input; i += 4) - { - __m128 _val = _mm_loadu_ps(m); - __m128 _k = _mm_loadu_ps(kptr); - _suml = _mm_add_ps(_mm_mul_ps(_val, _k), _suml); +#if NCNN_F16C && __AVX__ + if (cpu_support_x86_f16c() && opt.use_fp16_storage) + { + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif - m += 4; - kptr += 4; - } -#endif // __SSE2__ - for (; i < num_input; i++) - { - sum += *m++ * *kptr++; - } + const int num_input = weight_data_size / num_output; -#if __SSE2__ -#if __AVX__ - sum += _mm256_reduce_add_ps(_sum); -#endif // __AVX__ - sum += _mm_reduce_add_ps(_suml); -#endif // __SSE2__ + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; - sum = activation_ss(sum, activation_type, activation_params); + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - outptr[0] = sum; - outptr += 1; - } - } - } + innerproduct_gemm_sse(bottom_blob, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); return 0; } @@ -1413,602 +170,12 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Optio if (top_blob.empty()) return -100; -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ - if (out_elempack == 16) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - __m512 _sum0 = _mm512_set1_ps(0.f); - __m512 _sum1 = _mm512_set1_ps(0.f); - __m512 _sum2 = _mm512_set1_ps(0.f); - __m512 _sum3 = _mm512_set1_ps(0.f); - __m512 _sum4 = _mm512_set1_ps(0.f); - __m512 _sum5 = _mm512_set1_ps(0.f); - __m512 _sum6 = _mm512_set1_ps(0.f); - __m512 _sum7 = _mm512_set1_ps(0.f); - - if 
(bias_term) - { - _sum0 = _mm512_loadu_ps((const float*)bias_data + p * 16); - } - - const float* kptr = weight_data_tm.row(p); - - const float* sptr = bottom_blob_flattened; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m512 _val0 = _mm512_set1_ps(sptr[0]); - __m512 _val1 = _mm512_set1_ps(sptr[1]); - __m512 _val2 = _mm512_set1_ps(sptr[2]); - __m512 _val3 = _mm512_set1_ps(sptr[3]); - __m512 _val4 = _mm512_set1_ps(sptr[4]); - __m512 _val5 = _mm512_set1_ps(sptr[5]); - __m512 _val6 = _mm512_set1_ps(sptr[6]); - __m512 _val7 = _mm512_set1_ps(sptr[7]); - - __m512 _w0 = _mm512_loadu_ps(kptr + 16 * 0); - __m512 _w1 = _mm512_loadu_ps(kptr + 16 * 1); - __m512 _w2 = _mm512_loadu_ps(kptr + 16 * 2); - __m512 _w3 = _mm512_loadu_ps(kptr + 16 * 3); - __m512 _w4 = _mm512_loadu_ps(kptr + 16 * 4); - __m512 _w5 = _mm512_loadu_ps(kptr + 16 * 5); - __m512 _w6 = _mm512_loadu_ps(kptr + 16 * 6); - __m512 _w7 = _mm512_loadu_ps(kptr + 16 * 7); - - _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); - _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); - _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); - _sum4 = _mm512_fmadd_ps(_val4, _w4, _sum4); - _sum5 = _mm512_fmadd_ps(_val5, _w5, _sum5); - _sum6 = _mm512_fmadd_ps(_val6, _w6, _sum6); - _sum7 = _mm512_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 128; - } - for (; i + 3 < num_input; i += 4) - { - __m512 _val0 = _mm512_set1_ps(sptr[0]); - __m512 _val1 = _mm512_set1_ps(sptr[1]); - __m512 _val2 = _mm512_set1_ps(sptr[2]); - __m512 _val3 = _mm512_set1_ps(sptr[3]); - - __m512 _w0 = _mm512_loadu_ps(kptr); - __m512 _w1 = _mm512_loadu_ps(kptr + 16); - __m512 _w2 = _mm512_loadu_ps(kptr + 32); - __m512 _w3 = _mm512_loadu_ps(kptr + 48); - _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); - _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); - _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); - - sptr += 4; - kptr += 64; - } - for (; i < num_input; i++) - { - __m512 _val = _mm512_set1_ps(sptr[0]); - __m512 _w = _mm512_loadu_ps(kptr); - _sum0 = _mm512_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 16; - } - - _sum0 = _mm512_add_ps(_sum0, _sum1); - _sum2 = _mm512_add_ps(_sum2, _sum3); - _sum4 = _mm512_add_ps(_sum4, _sum5); - _sum6 = _mm512_add_ps(_sum6, _sum7); - _sum0 = _mm512_add_ps(_sum0, _sum2); - _sum4 = _mm512_add_ps(_sum4, _sum6); - _sum0 = _mm512_add_ps(_sum0, _sum4); - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm512_storeu_ps(outptr + p * 16, _sum0); - } - } - -#endif // __AVX512F__ - - if (out_elempack == 8) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_loadu_ps((const float*)bias_data + p * 8); - } - - const float* kptr = weight_data_tm.row(p); - - const float* sptr = bottom_blob_flattened; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _val0 = _mm256_broadcast_ss(sptr); - __m256 _val1 = _mm256_broadcast_ss(sptr + 1); - __m256 _val2 = _mm256_broadcast_ss(sptr + 2); - __m256 _val3 = _mm256_broadcast_ss(sptr + 3); - __m256 _val4 = _mm256_broadcast_ss(sptr + 4); - __m256 _val5 = _mm256_broadcast_ss(sptr + 5); 
- __m256 _val6 = _mm256_broadcast_ss(sptr + 6); - __m256 _val7 = _mm256_broadcast_ss(sptr + 7); - - __m256 _w0 = _mm256_loadu_ps(kptr); - _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); - __m256 _w1 = _mm256_loadu_ps(kptr + 8); - _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); - __m256 _w2 = _mm256_loadu_ps(kptr + 16); - _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); - __m256 _w3 = _mm256_loadu_ps(kptr + 24); - _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); - __m256 _w4 = _mm256_loadu_ps(kptr + 32); - _sum4 = _mm256_comp_fmadd_ps(_val4, _w4, _sum4); - __m256 _w5 = _mm256_loadu_ps(kptr + 40); - _sum5 = _mm256_comp_fmadd_ps(_val5, _w5, _sum5); - __m256 _w6 = _mm256_loadu_ps(kptr + 48); - _sum6 = _mm256_comp_fmadd_ps(_val6, _w6, _sum6); - __m256 _w7 = _mm256_loadu_ps(kptr + 56); - _sum7 = _mm256_comp_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 64; - } - for (; i + 3 < num_input; i += 4) - { - __m256 _val0 = _mm256_broadcast_ss(sptr); - __m256 _val1 = _mm256_broadcast_ss(sptr + 1); - __m256 _val2 = _mm256_broadcast_ss(sptr + 2); - __m256 _val3 = _mm256_broadcast_ss(sptr + 3); - - __m256 _w0 = _mm256_loadu_ps(kptr); - _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); - __m256 _w1 = _mm256_loadu_ps(kptr + 8); - _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); - __m256 _w2 = _mm256_loadu_ps(kptr + 16); - _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); - __m256 _w3 = _mm256_loadu_ps(kptr + 24); - _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); - - sptr += 4; - kptr += 32; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_set1_ps(sptr[0]); - __m256 _w = _mm256_loadu_ps(kptr); - _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 8; - } - - _sum0 = _mm256_add_ps(_sum0, _sum1); - _sum2 = _mm256_add_ps(_sum2, _sum3); - _sum4 = _mm256_add_ps(_sum4, _sum5); - _sum6 = _mm256_add_ps(_sum6, _sum7); - _sum0 = _mm256_add_ps(_sum0, _sum2); - _sum4 = _mm256_add_ps(_sum4, _sum6); - _sum0 = _mm256_add_ps(_sum0, _sum4); - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm256_storeu_ps(outptr + p * 8, _sum0); - } - } -#endif // __AVX__ - - if (out_elempack == 4) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = _mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); -#if __AVX__ - __m128 _sum4 = _mm_set1_ps(0.f); - __m128 _sum5 = _mm_set1_ps(0.f); - __m128 _sum6 = _mm_set1_ps(0.f); - __m128 _sum7 = _mm_set1_ps(0.f); -#endif - - if (bias_term) - { - _sum0 = _mm_loadu_ps((const float*)bias_data + p * 4); - } - - const float* kptr = weight_data_tm.row(p); - - const float* sptr = bottom_blob_flattened; - - int i = 0; -#if __AVX__ - for (; i + 7 < num_input; i += 8) - { - __m128 _val0 = _mm_broadcast_ss(sptr); - __m128 _val1 = _mm_broadcast_ss(sptr + 1); - __m128 _val2 = _mm_broadcast_ss(sptr + 2); - __m128 _val3 = _mm_broadcast_ss(sptr + 3); - __m128 _val4 = _mm_broadcast_ss(sptr + 4); - __m128 _val5 = _mm_broadcast_ss(sptr + 5); - __m128 _val6 = _mm_broadcast_ss(sptr + 6); - __m128 _val7 = _mm_broadcast_ss(sptr + 7); - - __m128 _w0 = _mm_loadu_ps(kptr); - _sum0 = _mm_comp_fmadd_ps(_val0, _w0, _sum0); - __m128 _w1 = _mm_loadu_ps(kptr + 4); - _sum1 = _mm_comp_fmadd_ps(_val1, _w1, _sum1); - __m128 _w2 = _mm_loadu_ps(kptr + 8); - _sum2 = _mm_comp_fmadd_ps(_val2, _w2, _sum2); - __m128 _w3 = _mm_loadu_ps(kptr + 12); - _sum3 = _mm_comp_fmadd_ps(_val3, _w3, _sum3); - __m128 
_w4 = _mm_loadu_ps(kptr + 16); - _sum4 = _mm_comp_fmadd_ps(_val4, _w4, _sum4); - __m128 _w5 = _mm_loadu_ps(kptr + 20); - _sum5 = _mm_comp_fmadd_ps(_val5, _w5, _sum5); - __m128 _w6 = _mm_loadu_ps(kptr + 24); - _sum6 = _mm_comp_fmadd_ps(_val6, _w6, _sum6); - __m128 _w7 = _mm_loadu_ps(kptr + 28); - _sum7 = _mm_comp_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 32; - } -#endif - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_set1_ps(sptr[0]); - __m128 _val1 = _mm_set1_ps(sptr[1]); - __m128 _val2 = _mm_set1_ps(sptr[2]); - __m128 _val3 = _mm_set1_ps(sptr[3]); - - __m128 _w0 = _mm_loadu_ps(kptr); - _sum0 = _mm_add_ps(_mm_mul_ps(_val0, _w0), _sum0); - __m128 _w1 = _mm_loadu_ps(kptr + 4); - _sum1 = _mm_add_ps(_mm_mul_ps(_val1, _w1), _sum1); - __m128 _w2 = _mm_loadu_ps(kptr + 8); - _sum2 = _mm_add_ps(_mm_mul_ps(_val2, _w2), _sum2); - __m128 _w3 = _mm_loadu_ps(kptr + 12); - _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); - - sptr += 4; - kptr += 16; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_set1_ps(sptr[0]); - __m128 _w = _mm_loadu_ps(kptr); - _sum0 = _mm_add_ps(_mm_mul_ps(_val, _w), _sum0); - - sptr += 1; - kptr += 4; - } - - _sum0 = _mm_add_ps(_sum0, _sum1); - _sum2 = _mm_add_ps(_sum2, _sum3); -#if __AVX__ - _sum4 = _mm_add_ps(_sum4, _sum5); - _sum6 = _mm_add_ps(_sum6, _sum7); -#endif - _sum0 = _mm_add_ps(_sum0, _sum2); -#if __AVX__ - _sum4 = _mm_add_ps(_sum4, _sum6); - _sum0 = _mm_add_ps(_sum0, _sum4); -#endif - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm_storeu_ps(outptr + p * 4, _sum0); - } - } -#endif // __SSE2__ - - if (out_elempack == 1) - { -#if __SSE2__ -#if __AVX__ - int remain_num_output_start = 0; - int nn_num_output = num_output >> 3; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) - { - int p = pp * 8; - - float sums[8] = {0.0f}; - if (bias_term) - { - sums[0] = bias_data[p]; - sums[1] = bias_data[p + 1]; - sums[2] = bias_data[p + 2]; - sums[3] = bias_data[p + 3]; - sums[4] = bias_data[p + 4]; - sums[5] = bias_data[p + 5]; - sums[6] = bias_data[p + 6]; - sums[7] = bias_data[p + 7]; - } - - const float* w0 = (const float*)weight_data_tm + num_input * p; - const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); - const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); - const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); - const float* w4 = (const float*)weight_data_tm + num_input * (p + 4); - const float* w5 = (const float*)weight_data_tm + num_input * (p + 5); - const float* w6 = (const float*)weight_data_tm + num_input * (p + 6); - const float* w7 = (const float*)weight_data_tm + num_input * (p + 7); - - const float* m = bottom_blob_flattened; - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = _mm256_loadu_ps(w0); - _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); - __m256 _w1 = _mm256_loadu_ps(w1); - _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); - __m256 _w2 = _mm256_loadu_ps(w2); - _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); - __m256 _w3 = _mm256_loadu_ps(w3); - _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); - __m256 _w4 = 
_mm256_loadu_ps(w4); - _sum4 = _mm256_comp_fmadd_ps(_m, _w4, _sum4); - __m256 _w5 = _mm256_loadu_ps(w5); - _sum5 = _mm256_comp_fmadd_ps(_m, _w5, _sum5); - __m256 _w6 = _mm256_loadu_ps(w6); - _sum6 = _mm256_comp_fmadd_ps(_m, _w6, _sum6); - __m256 _w7 = _mm256_loadu_ps(w7); - _sum7 = _mm256_comp_fmadd_ps(_m, _w7, _sum7); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - w4 += 8; - w5 += 8; - w6 += 8; - w7 += 8; - } - for (; i < num_input; i++) - { - sums[0] += *m * *w0; - sums[1] += *m * *w1; - sums[2] += *m * *w2; - sums[3] += *m * *w3; - sums[4] += *m * *w4; - sums[5] += *m * *w5; - sums[6] += *m * *w6; - sums[7] += *m * *w7; - - m++; - w0++; - w1++; - w2++; - w3++; - w4++; - w5++; - w6++; - w7++; - } - - __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); - __m256 _sums_f = _mm256_loadu_ps(sums); - _sums = _mm256_add_ps(_sums_f, _sums); - _sums = activation_avx(_sums, activation_type, activation_params); - - float* outptr = top_blob; - _mm256_storeu_ps(outptr + p, _sums); - } - - remain_num_output_start += (nn_num_output << 3); - nn_num_output = (num_output - remain_num_output_start) >> 2; -#else - int remain_num_output_start = 0; - int nn_num_output = num_output >> 2; -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) - { - int p = remain_num_output_start + (pp * 4); - - float sums[4] = {0.0f}; - if (bias_term) - { - sums[0] = bias_data[p]; - sums[1] = bias_data[p + 1]; - sums[2] = bias_data[p + 2]; - sums[3] = bias_data[p + 3]; - } - - const float* w0 = (const float*)weight_data_tm + num_input * p; - const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); - const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); - const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); - - const float* m = bottom_blob_flattened; - - int i = 0; -#if __AVX__ - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = _mm256_loadu_ps(w0); - _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); - __m256 _w1 = _mm256_loadu_ps(w1); - _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); - __m256 _w2 = _mm256_loadu_ps(w2); - _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); - __m256 _w3 = _mm256_loadu_ps(w3); - _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - } -#endif // __AVX__ - __m128 _sum0l = _mm_set1_ps(0.f); - __m128 _sum1l = _mm_set1_ps(0.f); - __m128 _sum2l = _mm_set1_ps(0.f); - __m128 _sum3l = _mm_set1_ps(0.f); - for (; i + 3 < num_input; i += 4) - { - __m128 _m = _mm_loadu_ps(m); - - __m128 _w0 = _mm_loadu_ps(w0); - _sum0l = _mm_add_ps(_mm_mul_ps(_m, _w0), _sum0l); - __m128 _w1 = _mm_loadu_ps(w1); - _sum1l = _mm_add_ps(_mm_mul_ps(_m, _w1), _sum1l); - __m128 _w2 = _mm_loadu_ps(w2); - _sum2l = _mm_add_ps(_mm_mul_ps(_m, _w2), _sum2l); - __m128 _w3 = _mm_loadu_ps(w3); - _sum3l = _mm_add_ps(_mm_mul_ps(_m, _w3), _sum3l); - - m += 4; - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - } - for (; i < num_input; i++) - { - sums[0] += *m * *w0; - sums[1] += *m * *w1; - sums[2] += *m * *w2; - sums[3] += *m * *w3; - - m++; - w0++; - w1++; - w2++; - w3++; - } - - __m128 _sums = _mm_loadu_ps(sums); -#if __AVX__ - _sums = _mm_add_ps(HorizontalSums(_sum0, _sum1, _sum2, _sum3), _sums); -#endif - _MM_TRANSPOSE4_PS(_sum0l, _sum1l, _sum2l, _sum3l); - _sums = 
_mm_add_ps(_sum0l, _sums); - _sums = _mm_add_ps(_sum1l, _sums); - _sums = _mm_add_ps(_sum2l, _sums); - _sums = _mm_add_ps(_sum3l, _sums); - _sums = activation_sse(_sums, activation_type, activation_params); - - float* outptr = top_blob; - _mm_storeu_ps(outptr + p, _sums); - } - - remain_num_output_start += (nn_num_output << 2); -#else - int remain_num_output_start = 0; -#endif // __SSE2__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = remain_num_output_start; p < num_output; p++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const float* w = (const float*)weight_data_tm + num_input * p; - - const float* m = bottom_blob_flattened; - - int i = 0; -#if __SSE2__ -#if __AVX__ - __m256 _sum = _mm256_set1_ps(0.f); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w = _mm256_loadu_ps(w); - _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); - - m += 8; - w += 8; - } -#endif // __AVX__ - __m128 _suml = _mm_set1_ps(0.f); - for (; i + 3 < num_input; i += 4) - { - __m128 _m = _mm_loadu_ps(m); - - __m128 _w = _mm_loadu_ps(w); - _suml = _mm_add_ps(_mm_mul_ps(_m, _w), _suml); - - m += 4; - w += 4; - } -#endif // __SSE2__ - for (; i < num_input; i++) - { - sum += *m * *w; - m++; - w++; - } - -#if __SSE2__ -#if __AVX__ - sum += _mm256_reduce_add_ps(_sum); -#endif - sum += _mm_reduce_add_ps(_suml); -#endif // __SSE2__ - - sum = activation_ss(sum, activation_type, activation_params); - - float* outptr = top_blob; - outptr[p] = sum; - } - } + innerproduct_sse(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); return 0; } -#if NCNN_F16C +#if NCNN_F16C && __AVX__ int InnerProduct_x86::create_pipeline_fp16s(const Option& opt) { const int num_input = weight_data_size / num_output; @@ -2071,31 +238,11 @@ int InnerProduct_x86::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const if (top_blob.empty()) return -100; -#if __AVX512F__ - if (out_elempack == 16) - { - innerproduct_fp16s_pack16_avx512(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); - } -#endif // __AVX512F__ - - if (out_elempack == 8) - { - innerproduct_fp16s_pack8_avx(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); - } - - if (out_elempack == 4) - { - innerproduct_fp16s_pack4_sse(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); - } - - if (out_elempack == 1) - { - innerproduct_fp16s_sse(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); - } + innerproduct_fp16s_sse(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); return 0; } -#endif // NCNN_F16C +#endif // NCNN_F16C && __AVX__ #if NCNN_INT8 int InnerProduct_x86::create_pipeline_int8_x86(const Option& opt) diff --git a/src/layer/x86/innerproduct_x86.h b/src/layer/x86/innerproduct_x86.h index ab4f26f1380d..211131e6132d 100644 --- a/src/layer/x86/innerproduct_x86.h +++ b/src/layer/x86/innerproduct_x86.h @@ -30,7 +30,7 @@ class InnerProduct_x86 : virtual public InnerProduct virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if NCNN_F16C +#if NCNN_F16C && __AVX__ int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif diff --git a/src/layer/x86/innerproduct_x86_f16c.cpp 
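The InnerProduct forward path above now delegates to innerproduct_sse(); the code it replaces is the usual SIMD dot-product recipe: an 8-wide FMA main loop, a 4-wide SSE loop, a scalar tail, and a horizontal reduction before the activation. A minimal sketch of that pattern for a single output, assuming plain float arrays rather than ncnn Mat (dot_avx and its arguments are illustrative names, not ncnn helpers):

    #include <immintrin.h>

    // Sketch only: one output's dot product, AVX main loop + scalar tail.
    static float dot_avx(const float* x, const float* w, int num_input, float bias)
    {
        __m256 _sum = _mm256_setzero_ps();
        int i = 0;
        for (; i + 7 < num_input; i += 8)
        {
            __m256 _x = _mm256_loadu_ps(x + i);
            __m256 _w = _mm256_loadu_ps(w + i);
            _sum = _mm256_add_ps(_sum, _mm256_mul_ps(_x, _w)); // fmadd where FMA is available
        }
        // horizontal reduction of the 8 partial sums
        __m128 lo = _mm256_castps256_ps128(_sum);
        __m128 hi = _mm256_extractf128_ps(_sum, 1);
        __m128 s = _mm_add_ps(lo, hi);
        s = _mm_add_ps(s, _mm_movehl_ps(s, s));
        s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));
        float sum = bias + _mm_cvtss_f32(s);
        for (; i < num_input; i++) // scalar tail
            sum += x[i] * w[i];
        return sum;
    }

With FMA enabled the multiply/add pair collapses into a fused multiply-add, which appears to be what the _mm256_comp_fmadd_ps wrapper in the removed code selects at build time.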
b/src/layer/x86/innerproduct_x86_f16c.cpp index efa2625997f5..e6a942aa3643 100644 --- a/src/layer/x86/innerproduct_x86_f16c.cpp +++ b/src/layer/x86/innerproduct_x86_f16c.cpp @@ -26,18 +26,10 @@ namespace ncnn { -#include "innerproduct_fp16s.h" -#include "innerproduct_gemm_fp16s.h" - -void innerproduct_fp16s_pack8_avx_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ - innerproduct_fp16s_pack8_avx(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); -} - -void innerproduct_fp16s_pack4_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ - innerproduct_fp16s_pack4_sse(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); -} +#define NCNN_IMPL_FP16S 1 +#include "innerproduct_fp.h" +#include "innerproduct_gemm_fp.h" +#undef NCNN_IMPL_FP16S void innerproduct_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) { diff --git a/src/layer/x86/layernorm_x86.cpp b/src/layer/x86/layernorm_x86.cpp index 3f6a66a5ec03..27ded2b4e3ab 100644 --- a/src/layer/x86/layernorm_x86.cpp +++ b/src/layer/x86/layernorm_x86.cpp @@ -119,7 +119,7 @@ static NCNN_FORCEINLINE void fast_mean(float* ptr, float* mean, int elempack, in #if __SSE2__ #if __AVX__ #if __AVX512F__ - sum += _mm512_reduce_add_ps(_sum_512); + sum += _mm512_comp_reduce_add_ps(_sum_512); #endif // __AVX512F__ sum += _mm256_reduce_add_ps(_sum_256); #endif // __AVX__ @@ -230,7 +230,7 @@ static NCNN_FORCEINLINE void fast_var(float* ptr, float* var, const float* mean, #if __SSE2__ #if __AVX__ #if __AVX512F__ - sq_sum += _mm512_reduce_add_ps(_sq_sum_512); + sq_sum += _mm512_comp_reduce_add_ps(_sq_sum_512); #endif // __AVX512F__ sq_sum += _mm256_reduce_add_ps(_sq_sum_256); #endif // __AVX__ diff --git a/src/layer/x86/lstm_x86.cpp b/src/layer/x86/lstm_x86.cpp index 59124f7907ad..53c8bfe902be 100644 --- a/src/layer/x86/lstm_x86.cpp +++ b/src/layer/x86/lstm_x86.cpp @@ -14,6 +14,13 @@ #include "lstm_x86.h" +#if __SSE2__ +#include +#if __AVX__ +#include +#endif +#endif // __SSE2__ + #include "x86_activation.h" #include "x86_usability.h" @@ -30,23 +37,183 @@ LSTM_x86::LSTM_x86() int LSTM_x86::create_pipeline(const Option& opt) { - (void)(opt); + // pack IFOG + int num_directions = direction == 2 ? 
2 : 1; + int size = weight_data_size / num_directions / hidden_size / 4; + +#if __AVX__ + weight_xc_data_packed.create(size, hidden_size / 2 + hidden_size % 2, num_directions, 32u, 8); + bias_c_data_packed.create(hidden_size, 1, num_directions, 16u, 4); + weight_hc_data_packed.create(num_output, hidden_size / 2 + hidden_size % 2, num_directions, 32u, 8); +#else + weight_xc_data_packed.create(size, hidden_size, num_directions, 16u, 4); + bias_c_data_packed.create(hidden_size, 1, num_directions, 16u, 4); + weight_hc_data_packed.create(num_output, hidden_size, num_directions, 16u, 4); +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int dr = 0; dr < num_directions; dr++) + { + const Mat weight_xc = weight_xc_data.channel(dr); + const Mat bias_c = bias_c_data.channel(dr); + const Mat weight_hc = weight_hc_data.channel(dr); + + Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr); + Mat bias_c_data_packed_dr = bias_c_data_packed.channel(dr); + Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr); + + const float* bias_c_I = bias_c.row(0); + const float* bias_c_F = bias_c.row(1); + const float* bias_c_O = bias_c.row(2); + const float* bias_c_G = bias_c.row(3); + + float* bias_c_IFOG = bias_c_data_packed_dr.row(0); + + int q = 0; +#if __AVX__ + for (; q + 1 < hidden_size; q += 2) + { + bias_c_IFOG[0] = bias_c_I[q]; + bias_c_IFOG[1] = bias_c_F[q]; + bias_c_IFOG[2] = bias_c_O[q]; + bias_c_IFOG[3] = bias_c_G[q]; + bias_c_IFOG[4] = bias_c_I[q + 1]; + bias_c_IFOG[5] = bias_c_F[q + 1]; + bias_c_IFOG[6] = bias_c_O[q + 1]; + bias_c_IFOG[7] = bias_c_G[q + 1]; + + bias_c_IFOG += 8; + + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); + const float* weight_xc_I_1 = weight_xc.row(hidden_size * 0 + q + 1); + const float* weight_xc_F_1 = weight_xc.row(hidden_size * 1 + q + 1); + const float* weight_xc_O_1 = weight_xc.row(hidden_size * 2 + q + 1); + const float* weight_xc_G_1 = weight_xc.row(hidden_size * 3 + q + 1); + + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); + const float* weight_hc_I_1 = weight_hc.row(hidden_size * 0 + q + 1); + const float* weight_hc_F_1 = weight_hc.row(hidden_size * 1 + q + 1); + const float* weight_hc_O_1 = weight_hc.row(hidden_size * 2 + q + 1); + const float* weight_hc_G_1 = weight_hc.row(hidden_size * 3 + q + 1); + + float* weight_xc_IFOG = weight_xc_data_packed_dr.row(q / 2); + float* weight_hc_IFOG = weight_hc_data_packed_dr.row(q / 2); + + for (int i = 0; i < size; i++) + { + weight_xc_IFOG[0] = weight_xc_I[i]; + weight_xc_IFOG[1] = weight_xc_F[i]; + weight_xc_IFOG[2] = weight_xc_O[i]; + weight_xc_IFOG[3] = weight_xc_G[i]; + weight_xc_IFOG[4] = weight_xc_I_1[i]; + weight_xc_IFOG[5] = weight_xc_F_1[i]; + weight_xc_IFOG[6] = weight_xc_O_1[i]; + weight_xc_IFOG[7] = weight_xc_G_1[i]; + + weight_xc_IFOG += 8; + } + + for (int i = 0; i < num_output; i++) + { + weight_hc_IFOG[0] = weight_hc_I[i]; + weight_hc_IFOG[1] = weight_hc_F[i]; + weight_hc_IFOG[2] = weight_hc_O[i]; + weight_hc_IFOG[3] = weight_hc_G[i]; + weight_hc_IFOG[4] = weight_hc_I_1[i]; + weight_hc_IFOG[5] = weight_hc_F_1[i]; + weight_hc_IFOG[6] = 
weight_hc_O_1[i]; + weight_hc_IFOG[7] = weight_hc_G_1[i]; + + weight_hc_IFOG += 8; + } + } +#endif // __AVX__ + for (; q < hidden_size; q++) + { + bias_c_IFOG[0] = bias_c_I[q]; + bias_c_IFOG[1] = bias_c_F[q]; + bias_c_IFOG[2] = bias_c_O[q]; + bias_c_IFOG[3] = bias_c_G[q]; + + bias_c_IFOG += 4; + + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); + + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); + +#if __AVX__ + float* weight_xc_IFOG = weight_xc_data_packed_dr.row(q / 2 + q % 2); + float* weight_hc_IFOG = weight_hc_data_packed_dr.row(q / 2 + q % 2); +#else + float* weight_xc_IFOG = weight_xc_data_packed_dr.row(q); + float* weight_hc_IFOG = weight_hc_data_packed_dr.row(q); +#endif + + for (int i = 0; i < size; i++) + { + weight_xc_IFOG[0] = weight_xc_I[i]; + weight_xc_IFOG[1] = weight_xc_F[i]; + weight_xc_IFOG[2] = weight_xc_O[i]; + weight_xc_IFOG[3] = weight_xc_G[i]; + + weight_xc_IFOG += 4; + } + + for (int i = 0; i < num_output; i++) + { + weight_hc_IFOG[0] = weight_hc_I[i]; + weight_hc_IFOG[1] = weight_hc_F[i]; + weight_hc_IFOG[2] = weight_hc_O[i]; + weight_hc_IFOG[3] = weight_hc_G[i]; + + weight_hc_IFOG += 4; + } + } + } + + if (opt.lightmode) + { + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + } return 0; } -#ifdef __AVX__ -static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) + +static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(num_output, 4, 4u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 4u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -59,267 +226,222 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w int ti = reverse ? 
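The create_pipeline packing above interleaves the four gate weights so that each input element's I/F/O/G coefficients are adjacent in memory (and, under AVX, two output rows are fused into one 8-float group). A scalar sketch of the non-AVX 4-float interleave, with illustrative names and std::vector standing in for ncnn::Mat:

    #include <vector>

    // Interleave four per-gate weight rows into groups of 4 floats per input element.
    static void pack_ifog(const float* w_I, const float* w_F, const float* w_O, const float* w_G,
                          int size, std::vector<float>& packed)
    {
        packed.resize(size * 4);
        float* p = packed.data();
        for (int i = 0; i < size; i++)
        {
            p[0] = w_I[i];
            p[1] = w_F[i];
            p[2] = w_O[i];
            p[3] = w_G[i];
            p += 4;
        }
    }

The point of the layout is that the forward pass can then load one contiguous vector of gate weights per input element instead of gathering from four separate rows.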
T - 1 - t : t; - int nn_num_output = num_output >> 1; - int remain_num_output_start = nn_num_output << 1; +#if __AVX__ + int nn_hidden_size = hidden_size >> 1; + int remain_hidden_size_start = nn_hidden_size << 1; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 2; - const float* x = bottom_blob.row(ti); - const float* hidden_ptr_r = hidden_state; - const float* bias_c_I = bias_c.row(0); - const float* bias_c_F = bias_c.row(1); - const float* bias_c_O = bias_c.row(2); - const float* bias_c_G = bias_c.row(3); - - float* gates_data_I = gates.row(0); - float* gates_data_F = gates.row(1); - float* gates_data_O = gates.row(2); - float* gates_data_G = gates.row(3); + const float* bias_c_IFOG = (const float*)bias_c + q * 4; + // gate I F O G - const float* weight_xc_I_0 = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F_0 = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O_0 = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G_0 = weight_xc.row(num_output * 3 + q); - const float* weight_xc_I_1 = weight_xc.row(num_output * 0 + (q + 1)); - const float* weight_xc_F_1 = weight_xc.row(num_output * 1 + (q + 1)); - const float* weight_xc_O_1 = weight_xc.row(num_output * 2 + (q + 1)); - const float* weight_xc_G_1 = weight_xc.row(num_output * 3 + (q + 1)); - - const float* weight_hc_I_0 = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F_0 = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O_0 = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G_0 = weight_hc.row(num_output * 3 + q); - const float* weight_hc_I_1 = weight_hc.row(num_output * 0 + (q + 1)); - const float* weight_hc_F_1 = weight_hc.row(num_output * 1 + (q + 1)); - const float* weight_hc_O_1 = weight_hc.row(num_output * 2 + (q + 1)); - const float* weight_hc_G_1 = weight_hc.row(num_output * 3 + (q + 1)); - - // float I = bias_c_I[q]; - // float F = bias_c_F[q]; - // float O = bias_c_O[q]; - // float G = bias_c_G[q]; - __m256 _sumI_0 = _mm256_setzero_ps(); - __m256 _sumF_0 = _mm256_setzero_ps(); - __m256 _sumO_0 = _mm256_setzero_ps(); - __m256 _sumG_0 = _mm256_setzero_ps(); - __m256 _sumI_1 = _mm256_setzero_ps(); - __m256 _sumF_1 = _mm256_setzero_ps(); - __m256 _sumO_1 = _mm256_setzero_ps(); - __m256 _sumG_1 = _mm256_setzero_ps(); - int nn_num_size = size >> 3; - int remain_size = size & 7; - for (; nn_num_size > 0; nn_num_size--) + const float* weight_xc_IFOG = weight_xc.row(q / 2); + const float* weight_hc_IFOG = weight_hc.row(q / 2); + + __m256 _IFOG = _mm256_loadu_ps(bias_c_IFOG); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); + + const float* x = bottom_blob.row(ti); + + int i = 0; + for (; i + 3 < size; i += 4) { - __m256 xi = _mm256_loadu_ps(x); - _sumI_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_I_0), xi, _sumI_0); - _sumF_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_F_0), xi, _sumF_0); - _sumO_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_O_0), xi, _sumO_0); - _sumG_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_G_0), xi, _sumG_0); - _sumI_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_I_1), xi, _sumI_1); - _sumF_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_F_1), xi, _sumF_1); - _sumO_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_O_1), xi, _sumO_1); - _sumG_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_G_1), xi, _sumG_1); - x += 8; - weight_xc_I_0 += 
8; - weight_xc_F_0 += 8; - weight_xc_O_0 += 8; - weight_xc_G_0 += 8; - weight_xc_I_1 += 8; - weight_xc_F_1 += 8; - weight_xc_O_1 += 8; - weight_xc_G_1 += 8; + __m256 _xi0 = _mm256_broadcast_ss(x); + __m256 _xi1 = _mm256_broadcast_ss(x + 1); + __m256 _xi2 = _mm256_broadcast_ss(x + 2); + __m256 _xi3 = _mm256_broadcast_ss(x + 3); + __m256 _weight_xc_IFOG0 = _mm256_loadu_ps(weight_xc_IFOG); + __m256 _weight_xc_IFOG1 = _mm256_loadu_ps(weight_xc_IFOG + 8); + __m256 _weight_xc_IFOG2 = _mm256_loadu_ps(weight_xc_IFOG + 16); + __m256 _weight_xc_IFOG3 = _mm256_loadu_ps(weight_xc_IFOG + 24); + _IFOG = _mm256_comp_fmadd_ps(_weight_xc_IFOG0, _xi0, _IFOG); + _sum1 = _mm256_comp_fmadd_ps(_weight_xc_IFOG1, _xi1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_weight_xc_IFOG2, _xi2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_weight_xc_IFOG3, _xi3, _sum3); + + x += 4; + weight_xc_IFOG += 32; } - int nn_num_output = num_output >> 3; - int remain_num_output = num_output & 7; - for (; nn_num_output > 0; nn_num_output--) + for (; i < size; i++) { - __m256 h_cont = _mm256_loadu_ps(hidden_ptr_r); - - _sumI_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_I_0), h_cont, _sumI_0); - _sumF_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_F_0), h_cont, _sumF_0); - _sumO_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_O_0), h_cont, _sumO_0); - _sumG_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_G_0), h_cont, _sumG_0); - _sumI_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_I_1), h_cont, _sumI_1); - _sumF_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_F_1), h_cont, _sumF_1); - _sumO_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_O_1), h_cont, _sumO_1); - _sumG_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_G_1), h_cont, _sumG_1); - hidden_ptr_r += 8; - weight_hc_I_0 += 8; - weight_hc_F_0 += 8; - weight_hc_O_0 += 8; - weight_hc_G_0 += 8; - weight_hc_I_1 += 8; - weight_hc_F_1 += 8; - weight_hc_O_1 += 8; - weight_hc_G_1 += 8; + __m256 _xi = _mm256_broadcast_ss(x); + __m256 _weight_xc_IFOG = _mm256_loadu_ps(weight_xc_IFOG); + _IFOG = _mm256_comp_fmadd_ps(_weight_xc_IFOG, _xi, _IFOG); + + x += 1; + weight_xc_IFOG += 8; } - float sums[8]; - _mm256_storeu_ps(sums, HorizontalSums(_sumI_0, _sumF_0, _sumO_0, _sumG_0, _sumI_1, _sumF_1, _sumO_1, _sumG_1)); - sums[0] += bias_c_I[q]; - sums[1] += bias_c_F[q]; - sums[2] += bias_c_O[q]; - sums[3] += bias_c_G[q]; - sums[4] += bias_c_I[q + 1]; - sums[5] += bias_c_F[q + 1]; - sums[6] += bias_c_O[q + 1]; - sums[7] += bias_c_G[q + 1]; - - for (; remain_size > 0; remain_size--) + + const float* hidden_ptr = hidden_state; + + i = 0; + for (; i + 3 < num_output; i += 4) { - float xi = *x; - sums[0] += *weight_xc_I_0 * xi; - sums[1] += *weight_xc_F_0 * xi; - sums[2] += *weight_xc_O_0 * xi; - sums[3] += *weight_xc_G_0 * xi; - sums[4] += *weight_xc_I_1 * xi; - sums[5] += *weight_xc_F_1 * xi; - sums[6] += *weight_xc_O_1 * xi; - sums[7] += *weight_xc_G_1 * xi; - x++; - weight_xc_I_0++; - weight_xc_F_0++; - weight_xc_O_0++; - weight_xc_G_0++; - weight_xc_I_1++; - weight_xc_F_1++; - weight_xc_O_1++; - weight_xc_G_1++; + __m256 _h_cont0 = _mm256_broadcast_ss(hidden_ptr); + __m256 _h_cont1 = _mm256_broadcast_ss(hidden_ptr + 1); + __m256 _h_cont2 = _mm256_broadcast_ss(hidden_ptr + 2); + __m256 _h_cont3 = _mm256_broadcast_ss(hidden_ptr + 3); + __m256 _weight_hc_IFOG0 = _mm256_loadu_ps(weight_hc_IFOG); + __m256 _weight_hc_IFOG1 = _mm256_loadu_ps(weight_hc_IFOG + 8); + __m256 _weight_hc_IFOG2 = _mm256_loadu_ps(weight_hc_IFOG + 16); + __m256 _weight_hc_IFOG3 = _mm256_loadu_ps(weight_hc_IFOG + 
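The rewritten gate loop broadcasts one scalar input (or previous hidden value) at a time and multiply-accumulates it against a contiguous block of packed IFOG weights, keeping four independent accumulators so consecutive FMAs do not stall on each other. A reduced single-accumulator, SSE-width sketch of the same idea, matching the pack_ifog layout sketched earlier (names are illustrative):

    #include <emmintrin.h>

    // Accumulate the 4 gate pre-activations (I,F,O,G) for one output unit.
    // packed_w holds 4 interleaved gate weights per input element; ifog starts at the packed bias.
    static void gate_ifog_sse(const float* x, const float* packed_w, int size, float* ifog /* 4 floats */)
    {
        __m128 _acc = _mm_loadu_ps(ifog);
        for (int i = 0; i < size; i++)
        {
            __m128 _xi = _mm_set1_ps(x[i]);               // broadcast one input value
            __m128 _w = _mm_loadu_ps(packed_w + i * 4);   // its 4 gate weights
            _acc = _mm_add_ps(_acc, _mm_mul_ps(_xi, _w)); // fmadd where available
        }
        _mm_storeu_ps(ifog, _acc);
    }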
24); + _IFOG = _mm256_comp_fmadd_ps(_weight_hc_IFOG0, _h_cont0, _IFOG); + _sum1 = _mm256_comp_fmadd_ps(_weight_hc_IFOG1, _h_cont1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_weight_hc_IFOG2, _h_cont2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_weight_hc_IFOG3, _h_cont3, _sum3); + + hidden_ptr += 4; + weight_hc_IFOG += 32; } - - for (; remain_num_output > 0; remain_num_output--) + for (; i < num_output; i++) { - float h_cont = *hidden_ptr_r; - sums[0] += *weight_hc_I_0 * h_cont; - sums[1] += *weight_hc_F_0 * h_cont; - sums[2] += *weight_hc_O_0 * h_cont; - sums[3] += *weight_hc_G_0 * h_cont; - sums[4] += *weight_hc_I_1 * h_cont; - sums[5] += *weight_hc_F_1 * h_cont; - sums[6] += *weight_hc_O_1 * h_cont; - sums[7] += *weight_hc_G_1 * h_cont; - hidden_ptr_r++; - weight_hc_I_0++; - weight_hc_F_0++; - weight_hc_O_0++; - weight_hc_G_0++; - weight_hc_I_1++; - weight_hc_F_1++; - weight_hc_O_1++; - weight_hc_G_1++; + __m256 _h_cont = _mm256_broadcast_ss(hidden_ptr); + __m256 _weight_hc_IFOG = _mm256_loadu_ps(weight_hc_IFOG); + _IFOG = _mm256_comp_fmadd_ps(_weight_hc_IFOG, _h_cont, _IFOG); + + hidden_ptr += 1; + weight_hc_IFOG += 8; } - gates_data_I[q] = sums[0]; - gates_data_F[q] = sums[1]; - gates_data_O[q] = sums[2]; - gates_data_G[q] = sums[3]; - gates_data_I[q + 1] = sums[4]; - gates_data_F[q + 1] = sums[5]; - gates_data_O[q + 1] = sums[6]; - gates_data_G[q + 1] = sums[7]; + + float* gates_data = gates.row(q); + + _IFOG = _mm256_add_ps(_IFOG, _sum1); + _sum2 = _mm256_add_ps(_sum2, _sum3); + _IFOG = _mm256_add_ps(_IFOG, _sum2); + + _mm256_storeu_ps(gates_data, _IFOG); } +#else + int nn_hidden_size = 0; + int remain_hidden_size_start = 0; +#endif // __AVX__ + #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { - const float* x = bottom_blob.row(ti); - const float* hidden_ptr_r = hidden_state; - const float* bias_c_I = bias_c.row(0); - const float* bias_c_F = bias_c.row(1); - const float* bias_c_O = bias_c.row(2); - const float* bias_c_G = bias_c.row(3); - - float* gates_data_I = gates.row(0); - float* gates_data_F = gates.row(1); - float* gates_data_O = gates.row(2); - float* gates_data_G = gates.row(3); + const float* bias_c_IFOG = (const float*)bias_c + q * 4; + // gate I F O G - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); - - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); - - // float I = bias_c_I[q]; - // float F = bias_c_F[q]; - // float O = bias_c_O[q]; - // float G = bias_c_G[q]; - __m256 _sumI = _mm256_setzero_ps(); - __m256 _sumF = _mm256_setzero_ps(); - __m256 _sumO = _mm256_setzero_ps(); - __m256 _sumG = _mm256_setzero_ps(); - int nn_num_size = size >> 3; - int remain_size = size & 7; - for (; nn_num_size > 0; nn_num_size--) +#if __AVX__ + const float* weight_xc_IFOG = weight_xc.row(q / 2 + q % 2); + const float* weight_hc_IFOG = weight_hc.row(q / 2 + q % 2); +#else + const float* weight_xc_IFOG = weight_xc.row(q); + const float* weight_hc_IFOG = weight_hc.row(q); +#endif + +#if __SSE2__ + __m128 _IFOG = _mm_loadu_ps(bias_c_IFOG); + __m128 _sum1 = 
_mm_setzero_ps(); + __m128 _sum2 = _mm_setzero_ps(); + __m128 _sum3 = _mm_setzero_ps(); +#else // __SSE2__ + float I = bias_c_IFOG[0]; + float F = bias_c_IFOG[1]; + float O = bias_c_IFOG[2]; + float G = bias_c_IFOG[3]; +#endif // __SSE2__ + + const float* x = bottom_blob.row(ti); + + int i = 0; +#if __SSE2__ + for (; i + 3 < size; i += 4) { - __m256 xi = _mm256_loadu_ps(x); - _sumI = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_I), xi, _sumI); - _sumF = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_F), xi, _sumF); - _sumO = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_O), xi, _sumO); - _sumG = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_G), xi, _sumG); - x += 8; - weight_xc_I += 8; - weight_xc_F += 8; - weight_xc_O += 8; - weight_xc_G += 8; + __m128 _xi0 = _mm_load1_ps(x); + __m128 _xi1 = _mm_load1_ps(x + 1); + __m128 _xi2 = _mm_load1_ps(x + 2); + __m128 _xi3 = _mm_load1_ps(x + 3); + __m128 _weight_xc_IFOG0 = _mm_loadu_ps(weight_xc_IFOG); + __m128 _weight_xc_IFOG1 = _mm_loadu_ps(weight_xc_IFOG + 4); + __m128 _weight_xc_IFOG2 = _mm_loadu_ps(weight_xc_IFOG + 8); + __m128 _weight_xc_IFOG3 = _mm_loadu_ps(weight_xc_IFOG + 12); + _IFOG = _mm_comp_fmadd_ps(_weight_xc_IFOG0, _xi0, _IFOG); + _sum1 = _mm_comp_fmadd_ps(_weight_xc_IFOG1, _xi1, _sum1); + _sum2 = _mm_comp_fmadd_ps(_weight_xc_IFOG2, _xi2, _sum2); + _sum3 = _mm_comp_fmadd_ps(_weight_xc_IFOG3, _xi3, _sum3); + + x += 4; + weight_xc_IFOG += 16; } - int nn_num_output = num_output >> 3; - int remain_num_output = num_output & 7; - for (; nn_num_output > 0; nn_num_output--) +#endif // __SSE2__ + for (; i < size; i++) { - __m256 h_cont = _mm256_loadu_ps(hidden_ptr_r); - - _sumI = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_I), h_cont, _sumI); - _sumF = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_F), h_cont, _sumF); - _sumO = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_O), h_cont, _sumO); - _sumG = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_G), h_cont, _sumG); - hidden_ptr_r += 8; - weight_hc_I += 8; - weight_hc_F += 8; - weight_hc_O += 8; - weight_hc_G += 8; +#if __SSE2__ + __m128 _xi = _mm_load1_ps(x); + __m128 _weight_xc_IFOG = _mm_loadu_ps(weight_xc_IFOG); + _IFOG = _mm_comp_fmadd_ps(_weight_xc_IFOG, _xi, _IFOG); +#else // __SSE2__ + float xi = x[0]; + I += xi * weight_xc_IFOG[0]; + F += xi * weight_xc_IFOG[1]; + O += xi * weight_xc_IFOG[2]; + G += xi * weight_xc_IFOG[3]; +#endif // __SSE2__ + + x += 1; + weight_xc_IFOG += 4; } - float sums[4]; - _mm_storeu_ps(sums, HorizontalSums(_sumI, _sumF, _sumO, _sumG)); - sums[0] += bias_c_I[q]; - sums[1] += bias_c_F[q]; - sums[2] += bias_c_O[q]; - sums[3] += bias_c_G[q]; - - for (; remain_size > 0; remain_size--) + + const float* hidden_ptr = hidden_state; + + i = 0; +#if __SSE2__ + for (; i + 3 < num_output; i += 4) { - float xi = *x; - sums[0] += *weight_xc_I * xi; - sums[1] += *weight_xc_F * xi; - sums[2] += *weight_xc_O * xi; - sums[3] += *weight_xc_G * xi; - x++; - weight_xc_I++; - weight_xc_F++; - weight_xc_O++; - weight_xc_G++; + __m128 _h_cont0 = _mm_load1_ps(hidden_ptr); + __m128 _h_cont1 = _mm_load1_ps(hidden_ptr + 1); + __m128 _h_cont2 = _mm_load1_ps(hidden_ptr + 2); + __m128 _h_cont3 = _mm_load1_ps(hidden_ptr + 3); + __m128 _weight_hc_IFOG0 = _mm_loadu_ps(weight_hc_IFOG); + __m128 _weight_hc_IFOG1 = _mm_loadu_ps(weight_hc_IFOG + 4); + __m128 _weight_hc_IFOG2 = _mm_loadu_ps(weight_hc_IFOG + 8); + __m128 _weight_hc_IFOG3 = _mm_loadu_ps(weight_hc_IFOG + 12); + _IFOG = _mm_comp_fmadd_ps(_weight_hc_IFOG0, _h_cont0, _IFOG); + _sum1 = _mm_comp_fmadd_ps(_weight_hc_IFOG1, 
_h_cont1, _sum1); + _sum2 = _mm_comp_fmadd_ps(_weight_hc_IFOG2, _h_cont2, _sum2); + _sum3 = _mm_comp_fmadd_ps(_weight_hc_IFOG3, _h_cont3, _sum3); + + hidden_ptr += 4; + weight_hc_IFOG += 16; } - - for (; remain_num_output > 0; remain_num_output--) +#endif // __SSE2__ + for (; i < num_output; i++) { - float h_cont = *hidden_ptr_r; - sums[0] += *weight_hc_I * h_cont; - sums[1] += *weight_hc_F * h_cont; - sums[2] += *weight_hc_O * h_cont; - sums[3] += *weight_hc_G * h_cont; - hidden_ptr_r++; - weight_hc_I++; - weight_hc_F++; - weight_hc_O++; - weight_hc_G++; +#if __SSE2__ + __m128 _h_cont = _mm_load1_ps(hidden_ptr); + __m128 _weight_hc_IFOG = _mm_loadu_ps(weight_hc_IFOG); + _IFOG = _mm_comp_fmadd_ps(_weight_hc_IFOG, _h_cont, _IFOG); +#else // __SSE2__ + float h_cont = hidden_ptr[0]; + I += h_cont * weight_hc_IFOG[0]; + F += h_cont * weight_hc_IFOG[1]; + O += h_cont * weight_hc_IFOG[2]; + G += h_cont * weight_hc_IFOG[3]; +#endif // __SSE2__ + + hidden_ptr += 1; + weight_hc_IFOG += 4; } - gates_data_I[q] = sums[0]; - gates_data_F[q] = sums[1]; - gates_data_O[q] = sums[2]; - gates_data_G[q] = sums[3]; + + float* gates_data = gates.row(q); + +#if __SSE2__ + _IFOG = _mm_add_ps(_IFOG, _sum1); + _sum2 = _mm_add_ps(_sum2, _sum3); + _IFOG = _mm_add_ps(_IFOG, _sum2); + + _mm_storeu_ps(gates_data, _IFOG); +#else // __SSE2__ + gates_data[0] = I; + gates_data[1] = F; + gates_data[2] = O; + gates_data[3] = G; +#endif // __SSE2__ } // lstm unit @@ -330,69 +452,117 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w // c_t := f_t .* c_{t-1} + i_t .* g_t // h_t := o_t .* tanh[c_t] float* output_data = top_blob.row(ti); + float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; - const float* gates_data_I = gates.row(0); - const float* gates_data_F = gates.row(1); - const float* gates_data_O = gates.row(2); - const float* gates_data_G = gates.row(3); - int nn_activation = num_output >> 3; - int remain_activations = num_output & 7; - for (; nn_activation > 0; nn_activation--) + float* tmp_hidden_ptr = tmp_hidden_state; + +#if __SSE2__ + nn_hidden_size = hidden_size >> 2; + remain_hidden_size_start = nn_hidden_size << 2; + #pragma omp parallel for num_threads(opt.num_threads) + for (int qq = 0; qq < nn_hidden_size; qq++) { - __m256 I = sigmoid_avx(_mm256_loadu_ps(gates_data_I)); - __m256 F = sigmoid_avx(_mm256_loadu_ps(gates_data_F)); - __m256 O = sigmoid_avx(_mm256_loadu_ps(gates_data_O)); - __m256 G = tanh_avx(_mm256_loadu_ps(gates_data_G)); - __m256 cell2 = _mm256_add_ps(_mm256_mul_ps(F, _mm256_loadu_ps(cell_ptr)), _mm256_mul_ps(I, G)); - __m256 H = _mm256_mul_ps(O, tanh_avx(cell2)); - _mm256_storeu_ps(cell_ptr, cell2); - _mm256_storeu_ps(hidden_ptr, H); - _mm256_storeu_ps(output_data, H); - cell_ptr += 8; - output_data += 8; - hidden_ptr += 8; - gates_data_I += 8; - gates_data_F += 8; - gates_data_O += 8; - gates_data_G += 8; + int q = qq * 4; + + const float* gates_data = gates.row(q); + + __m128 _IFOG_4x4_0 = _mm_loadu_ps(gates_data); + __m128 _IFOG_4x4_1 = _mm_loadu_ps(gates_data + 4); + __m128 _IFOG_4x4_2 = _mm_loadu_ps(gates_data + 8); + __m128 _IFOG_4x4_3 = _mm_loadu_ps(gates_data + 12); + + _MM_TRANSPOSE4_PS(_IFOG_4x4_0, _IFOG_4x4_1, _IFOG_4x4_2, _IFOG_4x4_3); + + __m128 _I = sigmoid_sse(_IFOG_4x4_0); + __m128 _F = sigmoid_sse(_IFOG_4x4_1); + __m128 _O = sigmoid_sse(_IFOG_4x4_2); + __m128 _G = tanh_sse(_IFOG_4x4_3); + + __m128 _cell2 = _mm_add_ps(_mm_mul_ps(_F, _mm_loadu_ps(cell_ptr + q)), _mm_mul_ps(_I, _G)); + __m128 _H = _mm_mul_ps(_O, tanh_sse(_cell2)); + + 
_mm_storeu_ps(cell_ptr + q, _cell2); + + if (num_output == hidden_size) + { + _mm_storeu_ps(hidden_ptr + q, _H); + _mm_storeu_ps(output_data + q, _H); + } + else + { + _mm_storeu_ps(tmp_hidden_ptr + q, _H); + } } - for (; remain_activations > 0; remain_activations--) +#else // __SSE2__ + remain_hidden_size_start = 0; +#endif // __SSE2__ + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { - float I = *gates_data_I; - float F = *gates_data_F; - float O = *gates_data_O; - float G = *gates_data_G; + const float* gates_data = gates.row(q); + + float I = gates_data[0]; + float F = gates_data[1]; + float O = gates_data[2]; + float G = gates_data[3]; I = 1.f / (1.f + exp(-I)); F = 1.f / (1.f + exp(-F)); O = 1.f / (1.f + exp(-O)); G = tanh(G); - float cell2 = F * *cell_ptr + I * G; + + float cell2 = F * cell_ptr[q] + I * G; float H = O * tanh(cell2); - *cell_ptr = cell2; - *hidden_ptr = H; - *output_data = H; - cell_ptr++; - output_data++; - hidden_ptr++; - gates_data_I++; - gates_data_F++; - gates_data_O++; - gates_data_G++; + + cell_ptr[q] = cell2; + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = H; + } + else + { + tmp_hidden_ptr[q] = H; + } } - // no cell output here + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + output_data[q] = H; + hidden_ptr[q] = H; + } + } } return 0; } -#endif int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { -#if __AVX__ int T = bottom_blob.h; + int num_directions = direction == 2 ? 2 : 1; // initial hidden state @@ -400,8 +570,8 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (hidden.empty()) return -100; hidden.fill(0.f); - // internal cell state - Mat cell(num_output, 4u, opt.workspace_allocator); + + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -413,7 +583,7 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -428,14 +598,14 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (top_blob_reverse.empty()) return -100; - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? 
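After the gate accumulation, the per-unit update is the standard LSTM recurrence; when num_output differs from hidden_size the new hidden values are first staged in tmp_hidden_state and then projected through weight_hr, as the code above shows. A plain scalar reference for the recurrence itself, assuming gates holds the raw I/F/O/G pre-activations in groups of four:

    #include <math.h>

    // Scalar reference for one time step; the code above vectorizes this 4 units at a time.
    static void lstm_unit(const float* gates /* 4*hidden_size */, float* cell, float* hidden,
                          float* out, int hidden_size)
    {
        for (int q = 0; q < hidden_size; q++)
        {
            float I = 1.f / (1.f + expf(-gates[q * 4 + 0]));
            float F = 1.f / (1.f + expf(-gates[q * 4 + 1]));
            float O = 1.f / (1.f + expf(-gates[q * 4 + 2]));
            float G = tanhf(gates[q * 4 + 3]);
            float c = F * cell[q] + I * G; // c_t = f .* c_{t-1} + i .* g
            float H = O * tanhf(c);        // h_t = o .* tanh(c_t)
            cell[q] = c;
            hidden[q] = H;
            out[q] = H;
        }
    }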
Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.0f); cell.fill(0.0f); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, cell, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -452,14 +622,10 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } return 0; -#else - return LSTM::forward(bottom_blob, top_blob, opt); -#endif } int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { -#if __AVX__ const Mat& bottom_blob = bottom_blobs[0]; int T = bottom_blob.h; int num_directions = direction == 2 ? 2 : 1; @@ -479,7 +645,7 @@ int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& to return -100; hidden.fill(0.f); - cell.create(num_output, num_directions, 4u, hidden_cell_allocator); + cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -493,7 +659,7 @@ int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& to // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -510,15 +676,13 @@ int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& to Mat hidden0 = hidden.row_range(0, 1); Mat cell0 = cell.row_range(0, 1); - - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, cell0, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt); if (ret0 != 0) return ret0; Mat hidden1 = hidden.row_range(1, 1); Mat cell1 = cell.row_range(1, 1); - - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, cell1, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? 
Mat() : weight_hr_data.channel(1), hidden1, cell1, opt); if (ret1 != 0) return ret1; @@ -541,9 +705,6 @@ int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& to } return 0; -#else - return LSTM::forward(bottom_blobs, top_blobs, opt); -#endif } } // namespace ncnn diff --git a/src/layer/x86/lstm_x86.h b/src/layer/x86/lstm_x86.h index 51ffb4139164..cab7d7e32fae 100644 --- a/src/layer/x86/lstm_x86.h +++ b/src/layer/x86/lstm_x86.h @@ -31,6 +31,9 @@ class LSTM_x86 : virtual public LSTM virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: + Mat weight_xc_data_packed; + Mat bias_c_data_packed; + Mat weight_hc_data_packed; }; } // namespace ncnn diff --git a/src/layer/x86/packing_x86.cpp b/src/layer/x86/packing_x86.cpp index 38d0a941d134..df120ee0726d 100644 --- a/src/layer/x86/packing_x86.cpp +++ b/src/layer/x86/packing_x86.cpp @@ -235,7 +235,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row5 = _mm256_loadu_ps(r5); __m256 _row6 = _mm256_loadu_ps(r6); __m256 _row7 = _mm256_loadu_ps(r7); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr, _row0); _mm256_storeu_ps(outptr + 8, _row1); _mm256_storeu_ps(outptr + 16, _row2); @@ -298,7 +298,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row5 = _mm256_loadu_ps(r0 + 40); __m256 _row6 = _mm256_loadu_ps(r0 + 48); __m256 _row7 = _mm256_loadu_ps(r0 + 56); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr0, _row0); _mm256_storeu_ps(outptr1, _row1); _mm256_storeu_ps(outptr2, _row2); @@ -432,7 +432,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rd = _mm512_loadu_ps(rd); __m512 _re = _mm512_loadu_ps(re); __m512 _rf = _mm512_loadu_ps(rf); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr, _r0); _mm512_storeu_ps(outptr + 16, _r1); _mm512_storeu_ps(outptr + 16 * 2, _r2); @@ -535,7 +535,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rd = _mm512_loadu_ps(r0 + 16 * 13); __m512 _re = _mm512_loadu_ps(r0 + 16 * 14); __m512 _rf = _mm512_loadu_ps(r0 + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr0, _r0); _mm512_storeu_ps(outptr1, _r1); _mm512_storeu_ps(outptr2, _r2); @@ -882,7 +882,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row5 = _mm256_loadu_ps(r5); __m256 _row6 = _mm256_loadu_ps(r6); __m256 _row7 = _mm256_loadu_ps(r7); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr, _row0); _mm256_storeu_ps(outptr + 8, _row1); _mm256_storeu_ps(outptr + 16, _row2); @@ -945,7 +945,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row5 = _mm256_loadu_ps(r0 + 40); __m256 _row6 = _mm256_loadu_ps(r0 + 48); __m256 _row7 = 
_mm256_loadu_ps(r0 + 56); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr0, _row0); _mm256_storeu_ps(outptr1, _row1); _mm256_storeu_ps(outptr2, _row2); @@ -1079,7 +1079,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rd = _mm512_loadu_ps(rd); __m512 _re = _mm512_loadu_ps(re); __m512 _rf = _mm512_loadu_ps(rf); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr, _r0); _mm512_storeu_ps(outptr + 16, _r1); _mm512_storeu_ps(outptr + 16 * 2, _r2); @@ -1182,7 +1182,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rd = _mm512_loadu_ps(r0 + 16 * 13); __m512 _re = _mm512_loadu_ps(r0 + 16 * 14); __m512 _rf = _mm512_loadu_ps(r0 + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr0, _r0); _mm512_storeu_ps(outptr1, _r1); _mm512_storeu_ps(outptr2, _r2); diff --git a/src/layer/x86/prelu_x86.cpp b/src/layer/x86/prelu_x86.cpp index 240cfd849dc0..52334659e26a 100644 --- a/src/layer/x86/prelu_x86.cpp +++ b/src/layer/x86/prelu_x86.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. 
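The transpose8_ps/transpose16_ps call sites here are only renamed to transpose8x8_ps/transpose16x16_ps; the operation is an in-register square transpose used when converting between packed layouts. For reference, the 4-lane version of the same idea can be written with the standard _MM_TRANSPOSE4_PS macro (a small sketch, not the ncnn helper):

    #include <xmmintrin.h>

    // Transpose a 4x4 float tile in registers; the 8x8/16x16 helpers renamed in this
    // diff do the same thing at AVX/AVX-512 width.
    static void transpose4x4(const float* in /* 16 floats, row-major */, float* out)
    {
        __m128 r0 = _mm_loadu_ps(in + 0);
        __m128 r1 = _mm_loadu_ps(in + 4);
        __m128 r2 = _mm_loadu_ps(in + 8);
        __m128 r3 = _mm_loadu_ps(in + 12);
        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
        _mm_storeu_ps(out + 0, r0);
        _mm_storeu_ps(out + 4, r1);
        _mm_storeu_ps(out + 8, r2);
        _mm_storeu_ps(out + 12, r3);
    }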
You may obtain a copy of the License at @@ -14,6 +14,12 @@ #include "prelu_x86.h" +#if __SSE2__ +#include +#if __AVX__ +#include +#endif // __AVX__ +#endif // __SSE2__ #include "x86_activation.h" namespace ncnn { @@ -28,219 +34,210 @@ PReLU_x86::PReLU_x86() int PReLU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int dims = bottom_top_blob.dims; -#if __SSE2__ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; int elempack = bottom_top_blob.elempack; -#if __AVX__ -#if __AVX512F__ - if (elempack == 16) + if (dims == 1) { - Mat tmp; - convert_packing(bottom_top_blob, tmp, 8, opt); - - forward_inplace(tmp, opt); - - convert_packing(tmp, bottom_top_blob, 16, opt); - - return 0; - } -#endif // __AVX512F__ + const int size = w * elempack; - if (elempack == 8) - { - if (dims == 1) + if (num_slope > 1) { - int w = bottom_top_blob.w; + float* ptr = bottom_top_blob; + const float* slope = slope_data; - if (num_slope > 1) + int nn_size = 0; + int remain_size_start = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + nn_size = (size - remain_size_start) / 16; + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) { - const float* slope = slope_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float* ptr = (float*)bottom_top_blob + i * 8; - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _slope = _mm256_loadu_ps(slope + i * 8); - _mm256_storeu_ps(ptr, prelu_avx(_p, _slope)); - } + int i = remain_size_start + ii * 16; + __m512 _p512 = _mm512_loadu_ps(ptr + i); + __m512 _slope512 = _mm512_loadu_ps(slope + i); + _mm512_storeu_ps(ptr + i, prelu_avx512(_p512, _slope512)); } - else + remain_size_start += nn_size * 16; +#endif // __AVX512F__ + nn_size = (size - remain_size_start) / 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) { - __m256 _slope = _mm256_set1_ps(slope_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float* ptr = (float*)bottom_top_blob + i * 8; - __m256 _p = _mm256_loadu_ps(ptr); - _mm256_storeu_ps(ptr, prelu_avx(_p, _slope)); - } + int i = remain_size_start + ii * 8; + __m256 _p256 = _mm256_loadu_ps(ptr + i); + __m256 _slope256 = _mm256_loadu_ps(slope + i); + _mm256_storeu_ps(ptr + i, prelu_avx(_p256, _slope256)); } - } - - if (dims == 2) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - + remain_size_start += nn_size * 8; +#endif // __AVX__ + nn_size = (size - remain_size_start) / 4; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int ii = 0; ii < nn_size; ii++) { - float* ptr = bottom_top_blob.row(i); - __m256 _slope = num_slope > 1 ? 
_mm256_loadu_ps((const float*)slope_data + i * 8) : _mm256_set1_ps(slope_data[0]); - - for (int j = 0; j < w; j++) - { - __m256 _p = _mm256_loadu_ps(ptr); - _mm256_storeu_ps(ptr, prelu_avx(_p, _slope)); - ptr += 8; - } + int i = remain_size_start + ii * 4; + __m128 _p128 = _mm_load_ps(ptr + i); + __m128 _slope128 = _mm_load_ps(slope + i); + _mm_store_ps(ptr + i, prelu_sse(_p128, _slope128)); + } + remain_size_start += nn_size * 4; +#endif // __SSE2__ + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope_data[i]; } } - - if (dims == 3) + else { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; + float* ptr = bottom_top_blob; + const float slope = slope_data[0]; + int nn_size = 0; + int remain_size_start = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + nn_size = (size - remain_size_start) / 16; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int ii = 0; ii < nn_size; ii++) { - float* ptr = bottom_top_blob.channel(q); - __m256 _slope = num_slope > 1 ? _mm256_loadu_ps((const float*)slope_data + q * 8) : _mm256_set1_ps(slope_data[0]); - - for (int i = 0; i < size; i++) - { - __m256 _p = _mm256_loadu_ps(ptr); - _mm256_storeu_ps(ptr, prelu_avx(_p, _slope)); - ptr += 8; - } + int i = remain_size_start + ii * 16; + __m512 _p512 = _mm512_loadu_ps(ptr + i); + __m512 _slope512 = _mm512_set1_ps(slope); + _mm512_storeu_ps(ptr + i, prelu_avx512(_p512, _slope512)); + } + remain_size_start += nn_size * 16; +#endif // __AVX512F__ + nn_size = (size - remain_size_start) / 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + __m256 _p256 = _mm256_loadu_ps(ptr + i); + __m256 _slope256 = _mm256_set1_ps(slope); + _mm256_storeu_ps(ptr + i, prelu_avx(_p256, _slope256)); + } + remain_size_start += nn_size * 8; +#endif // __AVX__ + nn_size = (size - remain_size_start) / 4; + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + __m128 _p128 = _mm_load_ps(ptr + i); + __m128 _slope128 = _mm_set1_ps(slope); + _mm_store_ps(ptr + i, prelu_sse(_p128, _slope128)); + } + remain_size_start += nn_size * 4; +#endif // __SSE2__ + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope; } } - - return 0; } -#endif // __AVX__ - if (elempack == 4) + if (dims == 2) { - if (dims == 1) + const int size = w * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - int w = bottom_top_blob.w; + float* ptr = bottom_top_blob.row(i); + int j = 0; - if (num_slope > 1) - { - const float* slope = slope_data; + float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; +#if __SSE2__ + __m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_load_ps((const float*)slope_data + i * 4) : _mm_set1_ps(slope); +#if __AVX__ + __m256 _slope256 = num_slope > 1 && (elempack == 8) ? _mm256_loadu_ps((const float*)slope_data + i * 8) : _mm256_insertf128_ps(_mm256_castps128_ps256(_slope128), _slope128, 1); +#if __AVX512F__ + __m512 _slope512 = num_slope > 1 && (elempack == 16) ? 
_mm512_loadu_ps((const float*)slope_data + i * 16) : _mm512_insertf32x8(_mm512_castps256_ps512(_slope256), _slope256, 1); - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float* ptr = (float*)bottom_top_blob + i * 4; - __m128 _p = _mm_loadu_ps(ptr); - __m128 _slope = _mm_loadu_ps(slope + i * 4); - _mm_storeu_ps(ptr, prelu_sse(_p, _slope)); - } + for (; j + 15 < size; j += 16) + { + __m512 _p512 = _mm512_loadu_ps(ptr); + _mm512_storeu_ps(ptr, prelu_avx512(_p512, _slope512)); + ptr += 16; } - else +#endif // __AVX512F__ + for (; j + 7 < size; j += 8) { - __m128 _slope = _mm_set1_ps(slope_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float* ptr = (float*)bottom_top_blob + i * 4; - __m128 _p = _mm_loadu_ps(ptr); - _mm_storeu_ps(ptr, prelu_sse(_p, _slope)); - } + __m256 _p256 = _mm256_loadu_ps(ptr); + _mm256_storeu_ps(ptr, prelu_avx(_p256, _slope256)); + ptr += 8; } - } - - if (dims == 2) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#endif // __AVX__ + for (; j + 3 < size; j += 4) { - float* ptr = bottom_top_blob.row(i); - __m128 _slope = num_slope > 1 ? _mm_loadu_ps((const float*)slope_data + i * 4) : _mm_set1_ps(slope_data[0]); - - for (int j = 0; j < w; j++) - { - __m128 _p = _mm_loadu_ps(ptr); - _mm_storeu_ps(ptr, prelu_sse(_p, _slope)); - ptr += 4; - } + __m128 _p128 = _mm_loadu_ps(ptr); + _mm_storeu_ps(ptr, prelu_sse(_p128, _slope128)); + ptr += 4; } - } - - if (dims == 3) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#endif // __SSE2__ + for (; j < size; j++) { - float* ptr = bottom_top_blob.channel(q); - __m128 _slope = num_slope > 1 ? _mm_loadu_ps((const float*)slope_data + q * 4) : _mm_set1_ps(slope_data[0]); - - for (int i = 0; i < size; i++) - { - __m128 _p = _mm_loadu_ps(ptr); - _mm_storeu_ps(ptr, prelu_sse(_p, _slope)); - ptr += 4; - } + if (*ptr < 0) + *ptr *= slope; + ptr++; } } - - return 0; } -#endif // __SSE2__ - - if (dims != 3) - return PReLU::forward_inplace(bottom_top_blob, opt); - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - - const float* slope_data_ptr = slope_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (dims == 3) { - float* ptr = bottom_top_blob.channel(q); - float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0]; + const int size = w * h * elempack; -#if __AVX__ - int nn = size >> 3; - int remain = size - (nn << 3); -#else - int remain = size; -#endif // __AVX__ + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + int i = 0; + float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; +#if __SSE2__ + __m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_load_ps((const float*)slope_data + q * 4) : _mm_set1_ps(slope); #if __AVX__ - for (; nn > 0; nn--) - { - __m256 _p = _mm256_loadu_ps(ptr); - _mm256_storeu_ps(ptr, prelu_avx(_p, _mm256_set1_ps(slope))); - ptr += 8; - } -#endif // __AVX__ - for (; remain > 0; remain--) - { - if (*ptr < 0) - *ptr *= slope; + __m256 _slope256 = num_slope > 1 && (elempack == 8) ? 
_mm256_loadu_ps((const float*)slope_data + q * 8) : _mm256_insertf128_ps(_mm256_castps128_ps256(_slope128), _slope128, 1); +#if __AVX512F__ + __m512 _slope512 = num_slope > 1 && (elempack == 16) ? _mm512_loadu_ps((const float*)slope_data + q * 16) : _mm512_insertf32x8(_mm512_castps256_ps512(_slope256), _slope256, 1); - ptr++; + for (; i + 15 < size; i += 16) + { + __m512 _p512 = _mm512_loadu_ps(ptr); + _mm512_storeu_ps(ptr, prelu_avx512(_p512, _slope512)); + ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p256 = _mm256_loadu_ps(ptr); + _mm256_storeu_ps(ptr, prelu_avx(_p256, _slope256)); + ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p128 = _mm_load_ps(ptr); + _mm_store_ps(ptr, prelu_sse(_p128, _slope128)); + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + ptr++; + } } } diff --git a/src/layer/x86/prelu_x86.h b/src/layer/x86/prelu_x86.h index d6d0e4509beb..6bbfeae0f0d0 100644 --- a/src/layer/x86/prelu_x86.h +++ b/src/layer/x86/prelu_x86.h @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at diff --git a/src/layer/x86/reshape_x86.cpp b/src/layer/x86/reshape_x86.cpp index 7d1c89b359c2..ab45ca647ade 100644 --- a/src/layer/x86/reshape_x86.cpp +++ b/src/layer/x86/reshape_x86.cpp @@ -208,7 +208,7 @@ int Reshape_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rowe = _mm512_loadu_ps(ptre); __m512 _rowf = _mm512_loadu_ps(ptrf); - transpose16_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7, _row8, _row9, _rowa, _rowb, _rowc, _rowd, _rowe, _rowf); + transpose16x16_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7, _row8, _row9, _rowa, _rowb, _rowc, _rowd, _rowe, _rowf); _mm512_storeu_ps(outptr, _row0); _mm512_storeu_ps(outptr + 16, _row1); @@ -297,7 +297,7 @@ int Reshape_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row6 = _mm256_loadu_ps(ptr6); __m256 _row7 = _mm256_loadu_ps(ptr7); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr, _row0); _mm256_storeu_ps(outptr + 8, _row1); @@ -526,7 +526,7 @@ int Reshape_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rowe = _mm512_loadu_ps(ptre); __m512 _rowf = _mm512_loadu_ps(ptrf); - transpose16_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7, _row8, _row9, _rowa, _rowb, _rowc, _rowd, _rowe, _rowf); + transpose16x16_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7, _row8, _row9, _rowa, _rowb, _rowc, _rowd, _rowe, _rowf); _mm512_storeu_ps(outptr, _row0); _mm512_storeu_ps(outptr + 16, _row1); @@ -615,7 +615,7 @@ int Reshape_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row6 = _mm256_loadu_ps(ptr6); __m256 _row7 = _mm256_loadu_ps(ptr7); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr, _row0); _mm256_storeu_ps(outptr + 8, _row1); diff --git a/src/layer/x86/softmax_x86.cpp 
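The reworked PReLU kernel walks w*elempack (or w*h*elempack) floats per row or channel, peeling AVX-512/AVX/SSE-width chunks before a scalar tail, with the slope either broadcast from a single value or loaded per lane when num_slope > 1. The underlying operation, y = x for x >= 0 and y = slope*x otherwise, in a small SSE sketch using unaligned loads (the packed paths above can use aligned ones):

    #include <emmintrin.h>

    // PReLU over a contiguous span with a single slope value.
    static void prelu_span(float* ptr, int size, float slope)
    {
        __m128 _zero = _mm_setzero_ps();
        __m128 _slope = _mm_set1_ps(slope);
        int i = 0;
        for (; i + 3 < size; i += 4)
        {
            __m128 _p = _mm_loadu_ps(ptr + i);
            __m128 _pos = _mm_max_ps(_zero, _p); // keeps the positive part
            __m128 _neg = _mm_min_ps(_zero, _p); // keeps the negative part
            _mm_storeu_ps(ptr + i, _mm_add_ps(_pos, _mm_mul_ps(_slope, _neg)));
        }
        for (; i < size; i++)
            if (ptr[i] < 0)
                ptr[i] *= slope;
    }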
b/src/layer/x86/softmax_x86.cpp index d1df7e446cf7..3a658a9a4bcb 100644 --- a/src/layer/x86/softmax_x86.cpp +++ b/src/layer/x86/softmax_x86.cpp @@ -125,7 +125,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const __m512 _pd = _mm512_load_ps(ptr + 16 * 13); __m512 _pe = _mm512_load_ps(ptr + 16 * 14); __m512 _pf = _mm512_load_ps(ptr + 16 * 15); - transpose16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); + transpose16x16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); __m512 _max01 = _mm512_max_ps(_p0, _p1); __m512 _max23 = _mm512_max_ps(_p2, _p3); __m512 _max45 = _mm512_max_ps(_p4, _p5); @@ -219,7 +219,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _mm512_store_ps(ptr + 16 * 13, _pd); _mm512_store_ps(ptr + 16 * 14, _pe); _mm512_store_ps(ptr + 16 * 15, _pf); - transpose16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); + transpose16x16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); __m512 _sum01 = _mm512_add_ps(_p0, _p1); __m512 _sum23 = _mm512_add_ps(_p2, _p3); __m512 _sum45 = _mm512_add_ps(_p4, _p5); @@ -341,7 +341,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const __m512 _pd = _mm512_load_ps(ptr + 16 * 13); __m512 _pe = _mm512_load_ps(ptr + 16 * 14); __m512 _pf = _mm512_load_ps(ptr + 16 * 15); - transpose16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); + transpose16x16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); __m512 _max01 = _mm512_max_ps(_p0, _p1); __m512 _max23 = _mm512_max_ps(_p2, _p3); __m512 _max45 = _mm512_max_ps(_p4, _p5); @@ -435,7 +435,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _mm512_store_ps(ptr + 16 * 13, _pd); _mm512_store_ps(ptr + 16 * 14, _pe); _mm512_store_ps(ptr + 16 * 15, _pf); - transpose16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); + transpose16x16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); __m512 _sum01 = _mm512_add_ps(_p0, _p1); __m512 _sum23 = _mm512_add_ps(_p2, _p3); __m512 _sum45 = _mm512_add_ps(_p4, _p5); @@ -687,7 +687,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const __m256 _p5 = _mm256_load_ps(ptr + 8 * 5); __m256 _p6 = _mm256_load_ps(ptr + 8 * 6); __m256 _p7 = _mm256_load_ps(ptr + 8 * 7); - transpose8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); + transpose8x8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); __m256 _max01 = _mm256_max_ps(_p0, _p1); __m256 _max23 = _mm256_max_ps(_p2, _p3); __m256 _max45 = _mm256_max_ps(_p4, _p5); @@ -749,7 +749,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _mm256_store_ps(ptr + 8 * 5, _p5); _mm256_store_ps(ptr + 8 * 6, _p6); _mm256_store_ps(ptr + 8 * 7, _p7); - transpose8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); + transpose8x8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); __m256 _sum01 = _mm256_add_ps(_p0, _p1); __m256 _sum23 = _mm256_add_ps(_p2, _p3); __m256 _sum45 = _mm256_add_ps(_p4, _p5); @@ -855,7 +855,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const __m256 _p5 = _mm256_load_ps(ptr + 8 * 5); __m256 _p6 = _mm256_load_ps(ptr + 8 * 6); __m256 _p7 = _mm256_load_ps(ptr + 8 * 7); - transpose8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); + transpose8x8_ps(_p0, _p1, _p2, _p3, _p4, 
_p5, _p6, _p7); __m256 _max01 = _mm256_max_ps(_p0, _p1); __m256 _max23 = _mm256_max_ps(_p2, _p3); __m256 _max45 = _mm256_max_ps(_p4, _p5); @@ -917,7 +917,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _mm256_store_ps(ptr + 8 * 5, _p5); _mm256_store_ps(ptr + 8 * 6, _p6); _mm256_store_ps(ptr + 8 * 7, _p7); - transpose8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); + transpose8x8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); __m256 _sum01 = _mm256_add_ps(_p0, _p1); __m256 _sum23 = _mm256_add_ps(_p2, _p3); __m256 _sum45 = _mm256_add_ps(_p4, _p5); diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h index 764e33e79768..75527d507b0a 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -286,6 +286,47 @@ static NCNN_FORCEINLINE v4sf exp_ps(v4sf x) return y; } +_PS_CONST(tanh_hi, 9.0f); +_PS_CONST(tanh_lo, -9.0f); + +_PS_CONST(cephes_tanh_p0, -2.76076847742355E-16f); +_PS_CONST(cephes_tanh_p1, 2.00018790482477E-13f); +_PS_CONST(cephes_tanh_p2, -8.60467152213735E-11f); +_PS_CONST(cephes_tanh_p3, 5.12229709037114E-08f); +_PS_CONST(cephes_tanh_p4, 1.48572235717979E-05f); +_PS_CONST(cephes_tanh_p5, 6.37261928875436E-04f); +_PS_CONST(cephes_tanh_p6, 4.89352455891786E-03f); +_PS_CONST(cephes_tanh_p7, 1.19825839466702e-06f); +_PS_CONST(cephes_tanh_p8, 1.18534705686654e-04f); +_PS_CONST(cephes_tanh_p9, 2.26843463243900e-03f); + +// an approximation of tanh +static inline v4sf tanh_ps(const v4sf x) +{ + v4sf value = x; + value = _mm_max_ps(*(v4sf*)_ps_tanh_lo, value); + value = _mm_min_ps(*(v4sf*)_ps_tanh_hi, value); + + v4sf value_squared = _mm_mul_ps(value, value); + + v4sf p; + p = _mm_comp_fmadd_ps(value_squared, *(v4sf*)_ps_cephes_tanh_p0, *(v4sf*)_ps_cephes_tanh_p1); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p2); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p3); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p4); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p5); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p6); + p = _mm_mul_ps(p, value); + + v4sf q; + q = _mm_comp_fmadd_ps(value_squared, *(v4sf*)_ps_cephes_tanh_p7, *(v4sf*)_ps_cephes_tanh_p8); + q = _mm_comp_fmadd_ps(q, value_squared, *(v4sf*)_ps_cephes_tanh_p9); + q = _mm_comp_fmadd_ps(q, value_squared, *(v4sf*)_ps_cephes_tanh_p6); + + v4sf dst = _mm_div_ps(p, q); + return dst; +} + _PS_CONST(minus_cephes_DP1, -0.78515625f); _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); diff --git a/src/layer/x86/x86_activation.h b/src/layer/x86/x86_activation.h index 6c58cd8f4ce1..4299d4d46b10 100644 --- a/src/layer/x86/x86_activation.h +++ b/src/layer/x86/x86_activation.h @@ -292,6 +292,13 @@ static NCNN_FORCEINLINE __m512 elu_avx512(__m512 inputs, __m512 alphas) return _mm512_add_ps(pos, _mm512_mul_ps(alphas, neg)); } +static NCNN_FORCEINLINE __m512 prelu_avx512(__m512 inputs, __m512 alphas) +{ + __m512 pos = _mm512_max_ps(_mm512_setzero_ps(), inputs); + __m512 neg = _mm512_min_ps(_mm512_setzero_ps(), inputs); + return _mm512_add_ps(pos, _mm512_mul_ps(alphas, neg)); +} + static NCNN_FORCEINLINE __m512 activation_avx512(__m512 _v, int activation_type, const ncnn::Mat& activation_params) { // Process fused activations diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 28ddfd50b952..669cec0a7387 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -155,20 +155,20 
@@ static NCNN_FORCEINLINE __m128i float2int8_sse(const __m128& _v0, const __m128& } #ifndef __FMA__ -static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(__m128 _a, const __m128 _b, const __m128 _c) +static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { return _mm_add_ps(_mm_mul_ps(_a, _b), _c); } -static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(__m128 _a, const __m128 _b, const __m128 _c) +static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { return _mm_sub_ps(_c, _mm_mul_ps(_a, _b)); } #else -static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(__m128 _a, const __m128 _b, const __m128 _c) +static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { return _mm_fmadd_ps(_a, _b, _c); } -static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(__m128 _a, const __m128 _b, const __m128 _c) +static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { // return -a * b + c return _mm_fnmadd_ps(_a, _b, _c); @@ -177,65 +177,165 @@ static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(__m128 _a, const __m128 _b, co #if __AVX__ #ifndef __FMA__ -static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(__m256 _a, const __m256 _b, const __m256 _c) +static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { return _mm256_add_ps(_mm256_mul_ps(_a, _b), _c); } -static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(__m256 _a, const __m256 _b, const __m256 _c) +static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { return _mm256_sub_ps(_c, _mm256_mul_ps(_a, _b)); } #else -static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(__m256 _a, const __m256 _b, const __m256 _c) +static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { return _mm256_fmadd_ps(_a, _b, _c); } -static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(__m256 _a, const __m256 _b, const __m256 _c) +static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { // return -a * b + c return _mm256_fnmadd_ps(_a, _b, _c); } #endif -static NCNN_FORCEINLINE __m256 _mm256_fmadd_1_ps(__m256 a, __m256 b, float c) +static NCNN_FORCEINLINE __m256 _mm256_fmadd_1_ps(const __m256& a, const __m256& b, float c) { return _mm256_comp_fmadd_ps(b, _mm256_set1_ps(c), a); } -static NCNN_FORCEINLINE __m256 _mm256_fmrsub_1_ps(__m256 a, __m256 b, float c) +static NCNN_FORCEINLINE __m256 _mm256_fmrsub_1_ps(const __m256& a, const __m256& b, float c) { // return a - b * c return _mm256_comp_fnmadd_ps(b, _mm256_set1_ps(c), a); } -// From: https://stackoverflow.com/a/25627536 -static NCNN_FORCEINLINE void transpose8_ps(__m256& row0, __m256& row1, __m256& row2, __m256& row3, __m256& row4, __m256& row5, __m256& row6, __m256& row7) -{ - __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; - __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; - __t0 = _mm256_unpacklo_ps(row0, row1); - __t1 = _mm256_unpackhi_ps(row0, row1); - __t2 = _mm256_unpacklo_ps(row2, row3); - __t3 = _mm256_unpackhi_ps(row2, row3); - __t4 = _mm256_unpacklo_ps(row4, row5); - __t5 = _mm256_unpackhi_ps(row4, row5); - __t6 = _mm256_unpacklo_ps(row6, row7); - __t7 = _mm256_unpackhi_ps(row6, row7); - __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); - __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); - __tt2 = 
_mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); - __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); - __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); - __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); - __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); - __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); - row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); - row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); - row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); - row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); - row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); - row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); - row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); - row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); + +static NCNN_FORCEINLINE void transpose8x12_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, __m256& _r4, __m256& _r5, __m256& _r6, __m256& _r7, + __m256& _r8, __m256& _r9, __m256& _ra, __m256& _rb) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); + __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); + __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); + __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); + __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); + __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); + __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb); + __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb); + + __m256 _tmpc = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpd = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpe = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpf = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpg = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmph = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpi = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpj = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpk = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpl = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpm = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpn = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); + _r5 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); + _r6 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 3, 0, 1)); + _r7 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); + _r8 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1)); + _r9 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 3, 0, 1)); + _ra = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + _rb = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose8x8_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, __m256& _r4, __m256& _r5, __m256& _r6, __m256& _r7) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 
_tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); + __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); + __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); + __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); + + __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); + _r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); + _r6 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); + _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose8x4_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + + __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); + _r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose8x2_ps(__m256& _r0, __m256& _r1) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + + _r0 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose8x8_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7) +{ + __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1); + __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3); + __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5); + __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7); + + __m128i _tmp8 = _mm_unpacklo_epi32(_tmp0, _tmp2); + __m128i _tmp9 = _mm_unpackhi_epi32(_tmp0, _tmp2); + __m128i _tmpa = _mm_unpacklo_epi32(_tmp1, _tmp3); + __m128i _tmpb = _mm_unpackhi_epi32(_tmp1, _tmp3); + __m128i _tmpc = _mm_unpacklo_epi32(_tmp4, _tmp6); + __m128i _tmpd = _mm_unpackhi_epi32(_tmp4, 
_tmp6); + __m128i _tmpe = _mm_unpacklo_epi32(_tmp5, _tmp7); + __m128i _tmpf = _mm_unpackhi_epi32(_tmp5, _tmp7); + + _r0 = _mm_unpacklo_epi64(_tmp8, _tmpc); + _r1 = _mm_unpackhi_epi64(_tmp8, _tmpc); + _r2 = _mm_unpacklo_epi64(_tmp9, _tmpd); + _r3 = _mm_unpackhi_epi64(_tmp9, _tmpd); + _r4 = _mm_unpacklo_epi64(_tmpa, _tmpe); + _r5 = _mm_unpackhi_epi64(_tmpa, _tmpe); + _r6 = _mm_unpacklo_epi64(_tmpb, _tmpf); + _r7 = _mm_unpackhi_epi64(_tmpb, _tmpf); } static NCNN_FORCEINLINE __m256 HorizontalSums(__m256& v0, __m256& v1, __m256& v2, __m256& v3, __m256& v4, __m256& v5, __m256& v6, __m256& v7) @@ -401,7 +501,7 @@ static NCNN_FORCEINLINE void _mm256_comp_fmadd_ps8(__m256& _sum, } #if __AVX512F__ -static NCNN_FORCEINLINE void transpose16_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7, +static NCNN_FORCEINLINE void transpose16x16_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7, __m512& _r8, __m512& _r9, __m512& _ra, __m512& _rb, __m512& _rc, __m512& _rd, __m512& _re, __m512& _rf) { __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); @@ -473,6 +573,302 @@ static NCNN_FORCEINLINE void transpose16_ps(__m512& _r0, __m512& _r1, __m512& _r _rf = _mm512_shuffle_f32x4(_tmpe, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); } +static NCNN_FORCEINLINE void transpose16x12_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7, + __m512& _r8, __m512& _r9, __m512& _ra, __m512& _rb) +{ + __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); + __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); + __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); + __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); + __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); + __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); + __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); + __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); + __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9); + __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9); + __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb); + __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb); + + __m512 _tmpc = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpd = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpe = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpf = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpg = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmph = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpi = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpj = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpk = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpl = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpm = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpn = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); + + _tmp0 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp1 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp3 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp4 = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp5 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp6 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp7 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(3, 
1, 3, 1)); + _tmp8 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp9 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(3, 1, 3, 1)); + _tmpa = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); + _tmpb = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1)); + + _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); + _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); + _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0)); + _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0)); + _r6 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + _r7 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + _r8 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); + _r9 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + _ra = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1)); + _rb = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1)); +} + +static NCNN_FORCEINLINE void transpose16x8_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7) +{ + __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); + __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); + __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); + __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); + __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); + __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); + __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); + __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); + + __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + + _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); + + _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); + _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); + _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); + _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); +} + +static NCNN_FORCEINLINE void transpose16x4_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3) +{ + __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); + __m512 _tmp1 
= _mm512_unpackhi_ps(_r0, _r1); + __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); + __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); + + __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + + _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + + _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); +} + +static NCNN_FORCEINLINE void transpose16x2_ps(__m512& _r0, __m512& _r1) +{ + __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); + __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); + + __m512 _tmp2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + __m512 _tmp3 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + + _r0 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); +} + +static NCNN_FORCEINLINE void transpose8x16_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, __m256& _r4, __m256& _r5, __m256& _r6, __m256& _r7, + __m256& _r8, __m256& _r9, __m256& _ra, __m256& _rb, __m256& _rc, __m256& _rd, __m256& _re, __m256& _rf) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); + __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); + __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); + __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); + __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); + __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); + __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb); + __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb); + __m256 _tmpc = _mm256_unpacklo_ps(_rc, _rd); + __m256 _tmpd = _mm256_unpackhi_ps(_rc, _rd); + __m256 _tmpe = _mm256_unpacklo_ps(_re, _rf); + __m256 _tmpf = _mm256_unpackhi_ps(_re, _rf); + + __m256 _tmpg = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmph = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpi = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpj = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpk = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpl = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpm = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpn = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpo = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpp = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpq = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpr = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmps = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpt = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpu = 
_mm256_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpv = _mm256_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmpg, _tmpk, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmpo, _tmps, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2f128_ps(_tmpp, _tmpt, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2f128_ps(_tmpi, _tmpm, _MM_SHUFFLE(0, 2, 0, 0)); + _r5 = _mm256_permute2f128_ps(_tmpq, _tmpu, _MM_SHUFFLE(0, 2, 0, 0)); + _r6 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); + _r7 = _mm256_permute2f128_ps(_tmpr, _tmpv, _MM_SHUFFLE(0, 2, 0, 0)); + _r8 = _mm256_permute2f128_ps(_tmpg, _tmpk, _MM_SHUFFLE(0, 3, 0, 1)); + _r9 = _mm256_permute2f128_ps(_tmpo, _tmps, _MM_SHUFFLE(0, 3, 0, 1)); + _ra = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1)); + _rb = _mm256_permute2f128_ps(_tmpp, _tmpt, _MM_SHUFFLE(0, 3, 0, 1)); + _rc = _mm256_permute2f128_ps(_tmpi, _tmpm, _MM_SHUFFLE(0, 3, 0, 1)); + _rd = _mm256_permute2f128_ps(_tmpq, _tmpu, _MM_SHUFFLE(0, 3, 0, 1)); + _re = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1)); + _rf = _mm256_permute2f128_ps(_tmpr, _tmpv, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose16x16_epi16(__m256i& _r0, __m256i& _r1, __m256i& _r2, __m256i& _r3, __m256i& _r4, __m256i& _r5, __m256i& _r6, __m256i& _r7, + __m256i& _r8, __m256i& _r9, __m256i& _ra, __m256i& _rb, __m256i& _rc, __m256i& _rd, __m256i& _re, __m256i& _rf) +{ + __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); + __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3); + __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); + __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); + __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); + __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); + __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7); + __m256i _tmp8 = _mm256_unpacklo_epi16(_r8, _r9); + __m256i _tmp9 = _mm256_unpackhi_epi16(_r8, _r9); + __m256i _tmpa = _mm256_unpacklo_epi16(_ra, _rb); + __m256i _tmpb = _mm256_unpackhi_epi16(_ra, _rb); + __m256i _tmpc = _mm256_unpacklo_epi16(_rc, _rd); + __m256i _tmpd = _mm256_unpackhi_epi16(_rc, _rd); + __m256i _tmpe = _mm256_unpacklo_epi16(_re, _rf); + __m256i _tmpf = _mm256_unpackhi_epi16(_re, _rf); + + __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); + __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); + __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); + __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); + __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); + __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); + __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); + __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); + __m256i _tmpo = _mm256_unpacklo_epi32(_tmp8, _tmpa); + __m256i _tmpp = _mm256_unpackhi_epi32(_tmp8, _tmpa); + __m256i _tmpq = _mm256_unpacklo_epi32(_tmp9, _tmpb); + __m256i _tmpr = _mm256_unpackhi_epi32(_tmp9, _tmpb); + __m256i _tmps = _mm256_unpacklo_epi32(_tmpc, _tmpe); + __m256i _tmpt = _mm256_unpackhi_epi32(_tmpc, _tmpe); + __m256i _tmpu = _mm256_unpacklo_epi32(_tmpd, _tmpf); + __m256i _tmpv = _mm256_unpackhi_epi32(_tmpd, _tmpf); + + _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); + _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); + _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); + _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); + _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); + _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); + _tmp6 = 
_mm256_unpacklo_epi64(_tmpj, _tmpn); + _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); + _tmp8 = _mm256_unpacklo_epi64(_tmpo, _tmps); + _tmp9 = _mm256_unpackhi_epi64(_tmpo, _tmps); + _tmpa = _mm256_unpacklo_epi64(_tmpp, _tmpt); + _tmpb = _mm256_unpackhi_epi64(_tmpp, _tmpt); + _tmpc = _mm256_unpacklo_epi64(_tmpq, _tmpu); + _tmpd = _mm256_unpackhi_epi64(_tmpq, _tmpu); + _tmpe = _mm256_unpacklo_epi64(_tmpr, _tmpv); + _tmpf = _mm256_unpackhi_epi64(_tmpr, _tmpv); + + _r0 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); + _r5 = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); + _r6 = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); + _r7 = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); + _r8 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 3, 0, 1)); + _r9 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 3, 0, 1)); + _ra = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 3, 0, 1)); + _rb = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 3, 0, 1)); + _rc = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); + _rd = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); + _re = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); + _rf = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose16x8_epi16(__m256i& _r0, __m256i& _r1, __m256i& _r2, __m256i& _r3, __m256i& _r4, __m256i& _r5, __m256i& _r6, __m256i& _r7) +{ + __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); + __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3); + __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); + __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); + __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); + __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); + __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7); + + __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); + __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); + __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); + __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); + __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); + __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); + __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); + __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); + + _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); + _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); + _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); + _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); + _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); + _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); + _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn); + _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); + + _r0 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); + _r5 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 3, 0, 1)); + _r6 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 
3, 0, 1)); + _r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); +} + static NCNN_FORCEINLINE float _mm512_comp_reduce_add_ps(__m512 x) { const __m256 x256 = _mm256_add_ps(_mm512_castps512_ps256(x), _mm512_extractf32x8_ps(x, 1)); diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in index 99c1d8336f41..6947ecce5d18 100644 --- a/src/layer_registry.h.in +++ b/src/layer_registry.h.in @@ -28,6 +28,12 @@ static const layer_registry_entry layer_registry_msa[] = { }; #endif // NCNN_RUNTIME_CPU && NCNN_MSA +#if NCNN_RUNTIME_CPU && NCNN_LSX +static const layer_registry_entry layer_registry_lsx[] = { +@layer_registry_lsx@ +}; +#endif // NCNN_RUNTIME_CPU && NCNN_LSX + #if NCNN_RUNTIME_CPU && NCNN_RVV static const layer_registry_entry layer_registry_rvv[] = { @layer_registry_rvv@ diff --git a/src/layer_shader_registry.h.in b/src/layer_shader_registry.h.in index 9a88eb4604e2..52e3f013cf1c 100644 --- a/src/layer_shader_registry.h.in +++ b/src/layer_shader_registry.h.in @@ -3,4 +3,3 @@ // This file is auto-generated by cmake, don't edit it. @layer_shader_registry@ - diff --git a/src/layer_shader_spv_data.h.in b/src/layer_shader_spv_data.h.in index ab1b7b8aaa2e..a4795bb15265 100644 --- a/src/layer_shader_spv_data.h.in +++ b/src/layer_shader_spv_data.h.in @@ -3,4 +3,3 @@ // This file is auto-generated by cmake, don't edit it. @layer_shader_spv_data@ - diff --git a/src/main.cpp b/src/main.cpp deleted file mode 100644 index e107caea2d63..000000000000 --- a/src/main.cpp +++ /dev/null @@ -1,76 +0,0 @@ -#include - -#include "mat.h" -#include "net.h" - -template -void print_mat_1d(ncnn::Mat &m, int start_w, int end_w) { - const T *p = m; - if (end_w == -1) { - end_w = m.w; - } - for (int w = start_w; w != end_w; ++w) { - std::cout << p[w] << ", "; - } - std::cout << "\n"; -} - -template -void print_mat_2d(ncnn::Mat &m, int start_h, int end_h, int start_w, - int end_w) { - if (end_h == -1) { - end_h = m.h; - } - for (int h = start_h; h != end_h; ++h) { - ncnn::Mat sub = m.row_range(h, 1); - print_mat_1d(sub, start_w, end_w); - } -} - -template -void print_mat_3d(ncnn::Mat &m, int start_c, int end_c, int start_h, int end_h, - int start_w, int end_w) { - if (end_c == -1) { - end_c = m.c; - } - - for (int c = start_c; c != end_c; ++c) { - std::cout << "c " << c << "\n"; - ncnn::Mat sub = m.channel_range(c, 1); - print_mat_2d(sub, start_h, end_h, start_w, end_w); - } -} - -int main() { - int c = 1; - int h = 6; - int w = 8; - int size = c * h * w; - // std::vector data(size); - std::vector data = {1, 3, 5, 4, 2}; - // for (int i = 0; i != size; ++i) { - // data[i] = i; - // } - ncnn::Option opt; - opt.num_threads = 1; - ncnn::Net net; - net.opt = opt; - net.load_param("foo/make_pad_mask.ncnn.param"); - net.load_model("foo/make_pad_mask.ncnn.bin"); - - ncnn::Extractor ex = net.create_extractor(); - - ncnn::Mat m(data.size(), data.data()); - m = m.clone(); - std::cout << "in\n"; - print_mat_1d(m, 0, -1); - std::cout << "\n"; - - ncnn::Mat out; - - ex.input("in0", m); - ex.extract("out0", out); - print_mat_2d(out, 0, -1, 0, -1); - ex.clear(); - net.clear(); -} diff --git a/src/mat.h b/src/mat.h index 6d7deb502a20..c6f59ef42684 100644 --- a/src/mat.h +++ b/src/mat.h @@ -29,6 +29,9 @@ #if __mips_msa #include #endif +#if __loongarch_sx +#include +#endif #if __riscv_vector #include #include "cpu.h" // cpu_riscv_vlenb() @@ -128,6 +131,9 @@ class NCNN_EXPORT Mat #if __mips_msa void fill(v4f32 _v); #endif // __mips_msa +#if __loongarch_sx + void fill(__m128 _v); +#endif //__loongarch_sx #if 
__riscv_vector void fill(vfloat32m1_t _v); void fill(vuint16m1_t _v); @@ -1067,11 +1073,23 @@ NCNN_FORCEINLINE void Mat::fill(v4f32 _v) } #endif // __mips_msa +#if __loongarch_sx +NCNN_FORCEINLINE void Mat::fill(__m128 _v) +{ + int size = (int)total(); + float* ptr = (float*)data; + for (int i = 0; i < size; i++) + { + __lsx_vst(_v, ptr, 0); + ptr += 4; + } +} +#endif // __loongarch_sx #if __riscv_vector NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) { const int packn = cpu_riscv_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int size = (int)total(); float* ptr = (float*)data; @@ -1085,7 +1103,7 @@ NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) { const int packn = cpu_riscv_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int size = (int)total(); unsigned short* ptr = (unsigned short*)data; @@ -1099,7 +1117,7 @@ NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) { const int packn = cpu_riscv_vlenb() / 1; - const word_type vl = vsetvl_e8m1(packn); + const size_t vl = vsetvl_e8m1(packn); int size = (int)total(); signed char* ptr = (signed char*)data; @@ -1113,7 +1131,7 @@ NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v) { const int packn = cpu_riscv_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int size = (int)total(); __fp16* ptr = (__fp16*)data; diff --git a/src/net.cpp b/src/net.cpp index e68507ca3b0d..8a09ebdc1ef8 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -108,6 +108,26 @@ NetPrivate::NetPrivate(Option& _opt) #endif // NCNN_VULKAN } +static Option get_masked_option(const Option& opt, int featmask) +{ + // mask option usage as layer specific featmask + Option opt1 = opt; + opt1.use_fp16_arithmetic = opt1.use_fp16_arithmetic && !(featmask & (1 << 0)); + opt1.use_fp16_storage = opt1.use_fp16_storage && !(featmask & (1 << 1)); + opt1.use_fp16_packed = opt1.use_fp16_packed && !(featmask & (1 << 1)); + opt1.use_bf16_storage = opt1.use_bf16_storage && !(featmask & (1 << 2)); + opt1.use_int8_packed = opt1.use_int8_packed && !(featmask & (1 << 3)); + opt1.use_int8_storage = opt1.use_int8_storage && !(featmask & (1 << 3)); + opt1.use_int8_arithmetic = opt1.use_int8_arithmetic && !(featmask & (1 << 3)); + opt1.use_vulkan_compute = opt1.use_vulkan_compute && !(featmask & (1 << 4)); + opt1.use_image_storage = opt1.use_image_storage && !(featmask & (1 << 4)); + opt1.use_tensor_storage = opt1.use_tensor_storage && !(featmask & (1 << 4)); + opt1.use_sgemm_convolution = opt1.use_sgemm_convolution && !(featmask & (1 << 5)); + opt1.use_winograd_convolution = opt1.use_winograd_convolution && !(featmask & (1 << 6)); + + return opt1; +} + #if NCNN_VULKAN int NetPrivate::upload_model() { @@ -132,7 +152,7 @@ int NetPrivate::upload_model() { if (layers[i]->support_vulkan) { - int uret = layers[i]->upload_model(cmd, opt_upload); + int uret = layers[i]->upload_model(cmd, get_masked_option(opt_upload, layers[i]->featmask)); if (uret != 0) { NCNN_LOGE("layer upload_model %d failed", (int)i); @@ -195,7 +215,15 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, cons bottom_blob.elemsize = blob_mats[bottom_blob_index].elemsize; } #endif - int ret = do_forward_layer(layer, blob_mats, opt); + int ret = 0; + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, 
layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats, opt); + } #if NCNN_BENCHMARK double end = get_current_time(); if (layer->one_blob_only) @@ -352,7 +380,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); #endif - ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats_gpu, cmd, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt); + } #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2 + 1); #endif @@ -368,7 +403,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: bottom_blob = blob_mats[bottom_blob_index].shape(); } #endif - ret = do_forward_layer(layer, blob_mats, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats, opt); + } #if NCNN_BENCHMARK double end = get_current_time(); if (layer->one_blob_only) @@ -677,7 +719,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: #endif if (layer->support_image_storage) { - ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, opt); + } if (ret == -100) { image_allocation_failed = true; @@ -686,7 +735,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: } else { - ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats_gpu, cmd, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt); + } } #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2 + 1); @@ -703,7 +759,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: bottom_blob = blob_mats[bottom_blob_index].shape(); } #endif - ret = do_forward_layer(layer, blob_mats, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats, opt); + } #if NCNN_BENCHMARK double end = get_current_time(); if (layer->one_blob_only) @@ -790,6 +853,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio // *INDENT-ON* // clang-format on + int dst_elempack = 1; if (opt.use_packing_layout) { // resolve dst_elempack @@ -801,7 +865,6 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio int elembits = bottom_blob.elembits(); - int dst_elempack = 1; if (layer->support_packing) { if (elembits == 32) @@ -855,13 +918,13 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio #endif } } + } - if (bottom_blob.elempack != dst_elempack) - { - Mat bottom_blob_packed; - convert_packing(bottom_blob, bottom_blob_packed, dst_elempack, opt); - bottom_blob = bottom_blob_packed; - } + if (bottom_blob.elempack != dst_elempack) + { + Mat bottom_blob_packed; + convert_packing(bottom_blob, bottom_blob_packed, dst_elempack, opt); + bottom_blob = bottom_blob_packed; } return 0; @@ -1571,6 +1634,9 @@ int Net::load_param(const DataReader& dr) layer->top_shapes[j] = d->blobs[layer->tops[j]].shape; } + // pull out layer 
specific feature disabled set + layer->featmask = pd.get(31, 0); + int lr = layer->load_param(pd); if (lr != 0) { @@ -1774,6 +1840,9 @@ int Net::load_param_bin(const DataReader& dr) layer->top_shapes[j] = d->blobs[layer->tops[j]].shape; } + // pull out layer specific feature disabled set + layer->featmask = pd.get(31, 0); + int lr = layer->load_param(pd); if (lr != 0) { @@ -1855,12 +1924,17 @@ int Net::load_model(const DataReader& dr) { Layer* layer = d->layers[i]; - Option opt1 = opt; + Option opt1 = get_masked_option(opt, layer->featmask); #if NCNN_VULKAN - if (opt.use_vulkan_compute) + if (opt1.use_vulkan_compute) { if (!layer->support_image_storage) opt1.use_image_storage = false; } + else + { + layer->vkdev = 0; + layer->support_vulkan = false; + } #endif // NCNN_VULKAN int cret = layer->create_pipeline(opt1); @@ -1891,7 +1965,7 @@ int Net::load_model(const DataReader& dr) if (!d->local_workspace_allocator) { d->local_workspace_allocator = new PoolAllocator; - d->local_workspace_allocator->set_size_compare_ratio(0.5f); + d->local_workspace_allocator->set_size_compare_ratio(0.f); } } } @@ -2066,11 +2140,13 @@ void Net::clear() { Layer* layer = d->layers[i]; - Option opt1 = opt; + Option opt1 = get_masked_option(opt, layer->featmask); +#if NCNN_VULKAN if (!layer->support_image_storage) { opt1.use_image_storage = false; } +#endif // NCNN_VULKAN int dret = layer->destroy_pipeline(opt1); if (dret != 0) diff --git a/src/option.cpp b/src/option.cpp index 4aabfdde5ed0..80d4455307ef 100644 --- a/src/option.cpp +++ b/src/option.cpp @@ -21,7 +21,7 @@ namespace ncnn { Option::Option() { lightmode = true; - num_threads = get_big_cpu_count(); + num_threads = get_physical_big_cpu_count(); blob_allocator = 0; workspace_allocator = 0; diff --git a/src/platform.h.in b/src/platform.h.in index 755f8294bc29..219cff4aada9 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -55,6 +55,7 @@ #cmakedefine01 NCNN_ARM86SVEF32MM #endif // __aarch64__ #cmakedefine01 NCNN_MSA +#cmakedefine01 NCNN_LSX #cmakedefine01 NCNN_MMI #cmakedefine01 NCNN_RVV #cmakedefine01 NCNN_INT8 diff --git a/src/simplestl.h b/src/simplestl.h index b8454c40ae55..00ff46801867 100644 --- a/src/simplestl.h +++ b/src/simplestl.h @@ -508,7 +508,7 @@ struct vector { capacity_ = new_size * 2; T* new_data = (T*)new char[capacity_ * sizeof(T)]; - memset(new_data, 0, capacity_ * sizeof(T)); + memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T)); if (data_) { memmove(new_data, data_, sizeof(T) * size_); diff --git a/src/stb_image.h b/src/stb_image.h index 35e20150b462..6aad778aba19 100644 --- a/src/stb_image.h +++ b/src/stb_image.h @@ -4851,7 +4851,7 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0); if (p == NULL) return stbi__err("outofmem", "Out of memory"); - // between here and free(out) below, exitting would leak + // between here and free(out) below, exiting would leak temp_out = p; if (pal_img_n == 3) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a88c6562db26..967fbd72befa 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -19,7 +19,18 @@ macro(ncnn_add_layer_test class) # enable if WITH_LAYER_xxx option ON if(${WITH_LAYER_${name}}) - ncnn_add_test(${name}) + file(GLOB test_${name}_SRCS "test_${name}.cpp" "test_${name}_*.cpp" LIST_DIRECTORIES FALSE) + + foreach(test_file ${test_${name}_SRCS}) + get_filename_component(test_filename ${test_file} NAME_WE) + add_executable(${test_filename} ${test_file}) + 
target_link_libraries(${test_filename} PRIVATE ncnn) + + add_test(NAME ${test_filename} COMMAND ${CMAKE_COMMAND} -DTEST_EXECUTABLE=$<TARGET_FILE:${test_filename}> -P ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/run_test.cmake) + + # add test to a virtual project group + set_property(TARGET ${test_filename} PROPERTY FOLDER "tests") + endforeach() endif() endmacro() @@ -85,8 +96,11 @@ ncnn_add_layer_test(Eltwise) ncnn_add_layer_test(ELU) ncnn_add_layer_test(ExpandDims) ncnn_add_layer_test(Flatten) +ncnn_add_layer_test(Fold) ncnn_add_layer_test(GELU) +ncnn_add_layer_test(GLU) ncnn_add_layer_test(Gemm) +ncnn_add_layer_test(GridSample) ncnn_add_layer_test(GroupNorm) ncnn_add_layer_test(GRU) ncnn_add_layer_test(HardSigmoid) @@ -134,4 +148,5 @@ ncnn_add_layer_test(Swish) ncnn_add_layer_test(TanH) ncnn_add_layer_test(Tile) ncnn_add_layer_test(UnaryOp) +ncnn_add_layer_test(Unfold) ncnn_add_layer_test(Yolov3DetectionOutput) diff --git a/tests/test_binaryop.cpp b/tests/test_binaryop.cpp index 44e3d1b369ed..f79ec024be14 100644 --- a/tests/test_binaryop.cpp +++ b/tests/test_binaryop.cpp @@ -382,7 +382,7 @@ int main() { SRAND(7767517); - for (op_type = 0; op_type < OP_TYPE_MAX; op_type++) + for (op_type = 0; op_type < 3; op_type++) { int ret = 0 || test_binaryop_1() diff --git a/tests/test_binaryop_1.cpp b/tests/test_binaryop_1.cpp new file mode 100644 index 000000000000..bc0ec9c8927f --- /dev/null +++ b/tests/test_binaryop_1.cpp @@ -0,0 +1,431 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/binaryop.h" +#include "testutil.h" + +#define OP_TYPE_MAX 9 + +static int op_type = 0; + +static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b) +{ + ncnn::Mat a = _a; + ncnn::Mat b = _b; + if (op_type == 6) + { + // value must be positive for pow + Randomize(a, 0.001f, 2.f); + Randomize(b, 0.001f, 2.f); + } + if (op_type == 3 || op_type == 8) + { + // value must be positive for div + Randomize(a, 0.1f, 10.f); + Randomize(b, 0.1f, 10.f); + } + + ncnn::ParamDict pd; + pd.set(0, op_type); + pd.set(1, 0); // with_scalar + pd.set(2, 0.f); // b + + std::vector<ncnn::Mat> weights(0); + + std::vector<ncnn::Mat> ab(2); + ab[0] = a; + ab[1] = b; + + int ret = test_layer("BinaryOp", pd, weights, ab); + if (ret != 0) + { + fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type); + } + + return ret; +} + +static int test_binaryop(const ncnn::Mat& _a, float b) +{ + ncnn::Mat a = _a; + if (op_type == 6) + { + // value must be positive for pow + Randomize(a, 0.001f, 2.f); + b = RandomFloat(0.001f, 2.f); + } + + ncnn::ParamDict pd; + pd.set(0, op_type); + pd.set(1, 1); // with_scalar + pd.set(2, b); // b + + std::vector<ncnn::Mat> weights(0); + + int ret = test_layer("BinaryOp", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type); + } + + return ret; +} + +// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting + +static int test_binaryop_1() +{ + return 0 + || test_binaryop(RandomMat(1), 1.f); +} + +static int test_binaryop_2() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(1)) + || test_binaryop(RandomMat(1), RandomMat(4)) + || test_binaryop(RandomMat(1), RandomMat(16)); +} + +static int test_binaryop_3() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 3)) + || test_binaryop(RandomMat(1), RandomMat(11, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 16)); +} + +static int test_binaryop_4() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 6, 16)); +} + +static int test_binaryop_5() +{ + return 0 + || test_binaryop(RandomMat(2), 1.f) + || test_binaryop(RandomMat(4), 1.f) + || test_binaryop(RandomMat(16), 1.f); +} + +static int test_binaryop_6() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(1)) + || test_binaryop(RandomMat(4), RandomMat(1)) + || test_binaryop(RandomMat(16), RandomMat(1)); +} + +static int test_binaryop_7() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(2)) + || test_binaryop(RandomMat(4), RandomMat(4)) + || test_binaryop(RandomMat(16), RandomMat(16)); +} + +static int test_binaryop_8() +{ + return 0 + || test_binaryop(RandomMat(3), RandomMat(11, 3)) + || test_binaryop(RandomMat(4), RandomMat(11, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 16)); +} + +static int test_binaryop_9() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_10() +{ + return 0 + || test_binaryop(RandomMat(11, 3), 1.f) + || test_binaryop(RandomMat(11, 4), 1.f) + || test_binaryop(RandomMat(11, 16), 1.f); +} + +static int test_binaryop_11() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(1)) + || test_binaryop(RandomMat(11, 4), RandomMat(1)) + || 
test_binaryop(RandomMat(11, 16), RandomMat(1)); +} + +static int test_binaryop_12() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(3)) + || test_binaryop(RandomMat(11, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 16), RandomMat(16)); +} + +static int test_binaryop_13() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(11, 3)) + || test_binaryop(RandomMat(11, 4), RandomMat(11, 4)) + || test_binaryop(RandomMat(11, 16), RandomMat(11, 16)); +} + +static int test_binaryop_14() +{ + return 0 + || test_binaryop(RandomMat(6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_15() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), 1.f) + || test_binaryop(RandomMat(11, 6, 4), 1.f) + || test_binaryop(RandomMat(11, 6, 16), 1.f); +} + +static int test_binaryop_16() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1)); +} + +static int test_binaryop_17() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(16)); +} + +static int test_binaryop_18() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(6, 16)); +} + +static int test_binaryop_19() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_20() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_21() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_22() +{ + return 0 + || test_binaryop(RandomMat(4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_23() +{ + return 0 + || test_binaryop(RandomMat(3, 4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(3, 4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(3, 4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_24() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), 1.f) + || test_binaryop(RandomMat(11, 3, 4, 4), 1.f) + || test_binaryop(RandomMat(11, 3, 4, 16), 1.f); +} + +static int test_binaryop_25() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(1)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(1)); +} + +static int test_binaryop_26() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(16)); +} + +static int test_binaryop_27() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), 
RandomMat(4, 16)); +} + +static int test_binaryop_28() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(3, 4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(3, 4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(3, 4, 16)); +} + +static int test_binaryop_29() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_s1() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1, 1, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1, 1, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1, 1, 16)); +} + +static int test_binaryop_s2() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 6, 1)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 6, 1)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 1)); +} + +static int test_binaryop_s3() +{ + return 0 + || test_binaryop(RandomMat(1, 1, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1, 1, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1, 1, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s4() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s5() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1, 6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1, 6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1, 6, 16)); +} + +static int test_binaryop_s6() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 1, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 1, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 1, 16)); +} + +static int test_binaryop_s7() +{ + return 0 + || test_binaryop(RandomMat(1, 6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1, 6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1, 6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s8() +{ + return 0 + || test_binaryop(RandomMat(11, 1, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 1, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 1, 16), RandomMat(11, 6, 16)); +} + +int main() +{ + SRAND(7767517); + + for (op_type = 3; op_type < 6; op_type++) + { + int ret = 0 + || test_binaryop_1() + || test_binaryop_2() + || test_binaryop_3() + || test_binaryop_4() + || test_binaryop_5() + || test_binaryop_6() + || test_binaryop_7() + || test_binaryop_8() + || test_binaryop_9() + || test_binaryop_10() + || test_binaryop_11() + || test_binaryop_12() + || test_binaryop_13() + || test_binaryop_14() + || test_binaryop_15() + || test_binaryop_16() + || test_binaryop_17() + || test_binaryop_18() + || test_binaryop_19() + || test_binaryop_20() + || test_binaryop_21() + || test_binaryop_22() + || test_binaryop_23() + || test_binaryop_24() + || test_binaryop_25() + || test_binaryop_26() + || test_binaryop_27() + || test_binaryop_28() + || test_binaryop_29() + || test_binaryop_s1() + || test_binaryop_s2() + || test_binaryop_s3() + || test_binaryop_s4() + || test_binaryop_s5() + || test_binaryop_s6() + || test_binaryop_s7() + || test_binaryop_s8(); + + if (ret != 0) + return ret; + } + + return 0; +} diff --git a/tests/test_binaryop_2.cpp b/tests/test_binaryop_2.cpp new file mode 100644 index 000000000000..1608a2880b06 --- 
/dev/null +++ b/tests/test_binaryop_2.cpp @@ -0,0 +1,431 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/binaryop.h" +#include "testutil.h" + +#define OP_TYPE_MAX 9 + +static int op_type = 0; + +static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b) +{ + ncnn::Mat a = _a; + ncnn::Mat b = _b; + if (op_type == 6) + { + // value must be positive for pow + Randomize(a, 0.001f, 2.f); + Randomize(b, 0.001f, 2.f); + } + if (op_type == 3 || op_type == 8) + { + // value must be positive for pow + Randomize(a, 0.1f, 10.f); + Randomize(b, 0.1f, 10.f); + } + + ncnn::ParamDict pd; + pd.set(0, op_type); + pd.set(1, 0); // with_scalar + pd.set(2, 0.f); // b + + std::vector weights(0); + + std::vector ab(2); + ab[0] = a; + ab[1] = b; + + int ret = test_layer("BinaryOp", pd, weights, ab); + if (ret != 0) + { + fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type); + } + + return ret; +} + +static int test_binaryop(const ncnn::Mat& _a, float b) +{ + ncnn::Mat a = _a; + if (op_type == 6) + { + // value must be positive for pow + Randomize(a, 0.001f, 2.f); + b = RandomFloat(0.001f, 2.f); + } + + ncnn::ParamDict pd; + pd.set(0, op_type); + pd.set(1, 1); // with_scalar + pd.set(2, b); // b + + std::vector weights(0); + + int ret = test_layer("BinaryOp", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type); + } + + return ret; +} + +// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting + +static int test_binaryop_1() +{ + return 0 + || test_binaryop(RandomMat(1), 1.f); +} + +static int test_binaryop_2() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(1)) + || test_binaryop(RandomMat(1), RandomMat(4)) + || test_binaryop(RandomMat(1), RandomMat(16)); +} + +static int test_binaryop_3() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 3)) + || test_binaryop(RandomMat(1), RandomMat(11, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 16)); +} + +static int test_binaryop_4() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 6, 16)); +} + +static int test_binaryop_5() +{ + return 0 + || test_binaryop(RandomMat(2), 1.f) + || test_binaryop(RandomMat(4), 1.f) + || test_binaryop(RandomMat(16), 1.f); +} + +static int test_binaryop_6() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(1)) + || test_binaryop(RandomMat(4), RandomMat(1)) + || test_binaryop(RandomMat(16), RandomMat(1)); +} + +static int test_binaryop_7() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(2)) + || test_binaryop(RandomMat(4), RandomMat(4)) + || 
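// A minimal sketch of what one of the blob-vs-blob cases above boils down to when the
// BinaryOp layer is driven through the public layer API directly.  test_layer() in
// testutil.h additionally covers packed, fp16 and GPU variants, which this sketch skips.
// The ParamDict keys (0 = op_type, 1 = with_scalar, 2 = b) are the ones used by the
// helper above; op_type 0 is assumed to be addition.
#include "layer.h" // ncnn/layer.h in an installed layout; pulls in Mat, Option and ParamDict
#include <vector>

static int run_binaryop_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& out)
{
    ncnn::Layer* op = ncnn::create_layer("BinaryOp");

    ncnn::ParamDict pd;
    pd.set(0, 0);   // op_type, 0 assumed to be addition
    pd.set(1, 0);   // with_scalar off, the second operand is a blob
    pd.set(2, 0.f); // scalar b, unused when with_scalar is 0
    op->load_param(pd);

    ncnn::Option opt;
    opt.num_threads = 1;
    op->create_pipeline(opt);

    std::vector<ncnn::Mat> bottoms(2);
    bottoms[0] = a; // e.g. a full blob such as RandomMat(11, 6, 16)
    bottoms[1] = b; // e.g. RandomMat(16), broadcast over w and h for each channel
    std::vector<ncnn::Mat> tops(1);
    int ret = op->forward(bottoms, tops, opt);

    out = tops[0];

    op->destroy_pipeline(opt);
    delete op;
    return ret;
}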
test_binaryop(RandomMat(16), RandomMat(16)); +} + +static int test_binaryop_8() +{ + return 0 + || test_binaryop(RandomMat(3), RandomMat(11, 3)) + || test_binaryop(RandomMat(4), RandomMat(11, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 16)); +} + +static int test_binaryop_9() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_10() +{ + return 0 + || test_binaryop(RandomMat(11, 3), 1.f) + || test_binaryop(RandomMat(11, 4), 1.f) + || test_binaryop(RandomMat(11, 16), 1.f); +} + +static int test_binaryop_11() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(1)) + || test_binaryop(RandomMat(11, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 16), RandomMat(1)); +} + +static int test_binaryop_12() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(3)) + || test_binaryop(RandomMat(11, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 16), RandomMat(16)); +} + +static int test_binaryop_13() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(11, 3)) + || test_binaryop(RandomMat(11, 4), RandomMat(11, 4)) + || test_binaryop(RandomMat(11, 16), RandomMat(11, 16)); +} + +static int test_binaryop_14() +{ + return 0 + || test_binaryop(RandomMat(6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_15() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), 1.f) + || test_binaryop(RandomMat(11, 6, 4), 1.f) + || test_binaryop(RandomMat(11, 6, 16), 1.f); +} + +static int test_binaryop_16() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1)); +} + +static int test_binaryop_17() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(16)); +} + +static int test_binaryop_18() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(6, 16)); +} + +static int test_binaryop_19() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_20() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_21() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_22() +{ + return 0 + || test_binaryop(RandomMat(4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_23() +{ + return 0 + || test_binaryop(RandomMat(3, 4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(3, 4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(3, 4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_24() +{ + return 0 + || 
test_binaryop(RandomMat(11, 3, 4, 2), 1.f) + || test_binaryop(RandomMat(11, 3, 4, 4), 1.f) + || test_binaryop(RandomMat(11, 3, 4, 16), 1.f); +} + +static int test_binaryop_25() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(1)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(1)); +} + +static int test_binaryop_26() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(16)); +} + +static int test_binaryop_27() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(4, 16)); +} + +static int test_binaryop_28() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(3, 4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(3, 4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(3, 4, 16)); +} + +static int test_binaryop_29() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_s1() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1, 1, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1, 1, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1, 1, 16)); +} + +static int test_binaryop_s2() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 6, 1)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 6, 1)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 1)); +} + +static int test_binaryop_s3() +{ + return 0 + || test_binaryop(RandomMat(1, 1, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1, 1, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1, 1, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s4() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s5() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1, 6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1, 6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1, 6, 16)); +} + +static int test_binaryop_s6() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 1, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 1, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 1, 16)); +} + +static int test_binaryop_s7() +{ + return 0 + || test_binaryop(RandomMat(1, 6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1, 6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1, 6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s8() +{ + return 0 + || test_binaryop(RandomMat(11, 1, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 1, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 1, 16), RandomMat(11, 6, 16)); +} + +int main() +{ + SRAND(7767517); + + for (op_type = 6; op_type < OP_TYPE_MAX; op_type++) + { + int ret = 0 + || test_binaryop_1() + || test_binaryop_2() + || test_binaryop_3() + || test_binaryop_4() + || test_binaryop_5() + || test_binaryop_6() + || test_binaryop_7() + || test_binaryop_8() + || test_binaryop_9() + || test_binaryop_10() + || 
test_binaryop_11() + || test_binaryop_12() + || test_binaryop_13() + || test_binaryop_14() + || test_binaryop_15() + || test_binaryop_16() + || test_binaryop_17() + || test_binaryop_18() + || test_binaryop_19() + || test_binaryop_20() + || test_binaryop_21() + || test_binaryop_22() + || test_binaryop_23() + || test_binaryop_24() + || test_binaryop_25() + || test_binaryop_26() + || test_binaryop_27() + || test_binaryop_28() + || test_binaryop_29() + || test_binaryop_s1() + || test_binaryop_s2() + || test_binaryop_s3() + || test_binaryop_s4() + || test_binaryop_s5() + || test_binaryop_s6() + || test_binaryop_s7() + || test_binaryop_s8(); + + if (ret != 0) + return ret; + } + + return 0; +} diff --git a/tests/test_c_api.cpp b/tests/test_c_api.cpp index 2c6dac4a8a24..7cdfc940f1d3 100644 --- a/tests/test_c_api.cpp +++ b/tests/test_c_api.cpp @@ -243,9 +243,15 @@ static int test_c_api_2() emptydr->read = emptydr_read; } + ncnn_allocator_t blob_allocator = ncnn_allocator_create_pool_allocator(); + ncnn_allocator_t workspace_allocator = ncnn_allocator_create_unlocked_pool_allocator(); + ncnn_option_t opt = ncnn_option_create(); { ncnn_option_set_num_threads(opt, 1); + + ncnn_option_set_blob_allocator(opt, blob_allocator); + ncnn_option_set_workspace_allocator(opt, workspace_allocator); } ncnn_net_t net = ncnn_net_create(); @@ -260,7 +266,7 @@ static int test_c_api_2() ncnn_net_load_model_datareader(net, emptydr); } - ncnn_mat_t a = ncnn_mat_create_1d(24, NULL); + ncnn_mat_t a = ncnn_mat_create_1d(24, blob_allocator); // set a { @@ -274,7 +280,7 @@ static int test_c_api_2() memcpy(a_data, data, 24 * sizeof(float)); } - ncnn_mat_t b = ncnn_mat_reshape_3d(a, 4, 2, 3, NULL); + ncnn_mat_t b = ncnn_mat_reshape_3d(a, 4, 2, 3, blob_allocator); ncnn_mat_t c = 0; { @@ -321,6 +327,9 @@ static int test_c_api_2() ncnn_option_destroy(opt); + ncnn_allocator_destroy(blob_allocator); + ncnn_allocator_destroy(workspace_allocator); + ncnn_datareader_destroy(emptydr); if (!success) diff --git a/tests/test_convolution.cpp b/tests/test_convolution.cpp index 5f7f5d209934..9140750e8c11 100644 --- a/tests/test_convolution.cpp +++ b/tests/test_convolution.cpp @@ -82,7 +82,7 @@ static int test_convolution_0() {7, 2, 1, -233}, }; - for (int i = 0; i < 16; i++) + for (int i = 0; i < 12; i++) { const int k = kdsp[i][0]; const int d = kdsp[i][1]; @@ -125,313 +125,12 @@ static int test_convolution_0() return -1; } - return 0 - || test_convolution(7, 5, 1, 4, 3, 1, 1, 1, 1) - || test_convolution(14, 5, 1, 4, 3, 1, 2, 1, 1) - || test_convolution(11, 5, 2, 12, 2, 2, 2, 1, 1) - || test_convolution(15, 11, 4, 4, 3, 1, 1, 1, 1) - || test_convolution(15, 11, 8, 8, 3, 1, 1, 1, 1) - || test_convolution(11, 11, 8, 16, 3, 1, 1, 1, 1) - || test_convolution(13, 16, 16, 24, 3, 1, 1, 1, 1) - || test_convolution(20, 19, 24, 24, 3, 1, 1, 1, 1) - || test_convolution(8, 8, 16, 24, 3, 1, 1, 1, 0) - || test_convolution(4, 8, 16, 24, 3, 1, 1, 1, 1) - || test_convolution(4, 20, 16, 24, 3, 1, 1, 1, 0) - || test_convolution(6, 7, 64, 64, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 24, 32, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 24, 32, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 24, 32, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 24, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 32, 24, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 24, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 28, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 32, 28, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 28, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 26, 
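// The test_c_api.cpp hunk above swaps the default (NULL) allocator for explicit pool
// allocators.  A minimal sketch of the create/use/destroy ordering, using only functions
// that appear in that hunk plus ncnn_mat_destroy(); the rule that pooled mats should be
// released before their allocator is an assumption drawn from the test's cleanup order.
#include "c_api.h" // installed as ncnn/c_api.h in some layouts

static void allocator_lifetime_sketch()
{
    ncnn_allocator_t blob_allocator = ncnn_allocator_create_pool_allocator();
    ncnn_allocator_t workspace_allocator = ncnn_allocator_create_unlocked_pool_allocator();

    ncnn_option_t opt = ncnn_option_create();
    ncnn_option_set_num_threads(opt, 1);
    ncnn_option_set_blob_allocator(opt, blob_allocator);
    ncnn_option_set_workspace_allocator(opt, workspace_allocator);

    // blobs drawn from the pool allocator
    ncnn_mat_t a = ncnn_mat_create_1d(24, blob_allocator);

    // ... run extraction with opt ...

    // release blobs first, then the option, then the allocators
    ncnn_mat_destroy(a);
    ncnn_option_destroy(opt);
    ncnn_allocator_destroy(blob_allocator);
    ncnn_allocator_destroy(workspace_allocator);
}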
32, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 26, 32, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 26, 32, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 26, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 32, 26, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 26, 3, 1, 2, 0, 1) - || test_convolution(30, 30, 32, 26, 3, 1, 1, 1, 0) - || test_convolution(12, 18, 8, 16, 3, 1, 1, 1, 1) - || test_convolution(42, 18, 32, 160, 3, 1, 1, 1, 1) - || test_convolution(12, 18, 32, 160, 3, 1, 1, 1, 1) - || test_convolution(12, 18, 4, 12, 3, 1, 1, 1, 1) - || test_convolution(42, 18, 28, 140, 3, 1, 1, 1, 1) - || test_convolution(12, 18, 28, 140, 3, 1, 1, 1, 1); -} - -static int test_convolution_vec(int w, int outch, int kernel, int dilation, int stride, int pad, int bias) -{ - ncnn::Mat a = RandomMat(w); - - ncnn::ParamDict pd; - pd.set(0, outch); // num_output - pd.set(1, kernel); // kernel_w - pd.set(2, dilation); // dilation_w - pd.set(3, stride); // stride_w - pd.set(4, pad); // pad_w - pd.set(5, bias); // bias_term - pd.set(6, outch * w * kernel * kernel); - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector weights(bias ? 2 : 1); - weights[0] = RandomMat(outch * w * kernel * kernel); - if (bias) - weights[1] = RandomMat(outch); - - int ret = test_layer("Convolution", pd, weights, a); - if (ret != 0) - { - fprintf(stderr, "test_convolution_vec failed w=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolution_2() -{ - return 0 - || test_convolution_vec(1, 1, 1, 1, 1, 0, 1) - || test_convolution_vec(11, 12, 1, 1, 1, 0, 0) - || test_convolution_vec(20, 15, 1, 1, 1, 0, 1) - || test_convolution_vec(12, 20, 1, 1, 1, 0, 0) - || test_convolution_vec(3, 24, 1, 1, 1, 0, 1) - || test_convolution_vec(24, 5, 1, 1, 1, 0, 0) - || test_convolution_vec(32, 24, 1, 1, 1, 0, 1) - || test_convolution_vec(12, 32, 1, 1, 1, 0, 0) - || test_convolution_vec(64, 20, 1, 1, 1, 0, 1) - || test_convolution_vec(64, 128, 1, 1, 1, 0, 0); -} - -static int test_convolution_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) -{ - ncnn::Mat a = RandomMat(w, h, c); - - ncnn::ParamDict pd; - pd.set(0, 0); - pd.set(1, 0); - pd.set(2, dilation); - pd.set(3, stride); - pd.set(4, pad); - pd.set(5, bias); - pd.set(6, 0); - pd.set(19, 1); // dynamic weight - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector as(bias ? 
3 : 2); - as[0] = a; - as[1] = RandomMat(kernel, kernel, c, outch); - if (bias) - as[2] = RandomMat(outch); - - std::vector weights(0); - - int ret = test_layer("Convolution", pd, weights, as); - if (ret != 0) - { - fprintf(stderr, "test_convolution_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolution_3() -{ - static const int kdsp[7][4] = { - {1, 1, 1, 0}, - {1, 1, 2, 0}, - {2, 1, 1, 1}, - {2, 1, 2, -233}, - {3, 1, 1, 1}, - {3, 1, 2, 1}, - {3, 2, 1, -234}, - }; - - for (int i = 0; i < 7; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolution_dynamic(11, 10, 1, 1, k, d, s, p, 1) - || test_convolution_dynamic(11, 10, 4, 13, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 13, 4, k, d, s, p, 1) - || test_convolution_dynamic(11, 10, 12, 12, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 8, 12, k, d, s, p, 1) - || test_convolution_dynamic(11, 10, 8, 13, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 13, 8, k, d, s, p, 1) - || test_convolution_dynamic(11, 10, 12, 16, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 15, 15, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 16, 16, k, d, s, p, 0); - - if (ret != 0) - return -1; - } - return 0; } -#if NCNN_INT8 -static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false) -{ - ncnn::Mat a = RandomMat(w, h, c); - - ncnn::ParamDict pd; - pd.set(0, outch); - pd.set(1, kernel); - pd.set(2, dilation); - pd.set(3, stride); - pd.set(4, pad); - pd.set(5, bias); - pd.set(6, outch * c * kernel * kernel); - pd.set(8, requant ? 101 : 1); // int8_scale_term - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector weights(bias ? 5 : 4); - weights[0] = RandomMat(outch * c * kernel * kernel); - - ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel); - ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); - ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); - if (bias) - { - weights[1] = RandomMat(outch); - weights[2] = weight_scales; - weights[3] = input_scales; - weights[4] = top_scales; - } - else - { - weights[1] = weight_scales; - weights[2] = input_scales; - weights[3] = top_scales; - } - - int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("Convolution", pd, weights, a, requant ? 
1.0f : 0.001f, 0, flag); - if (ret != 0) - { - fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolution_1() -{ - static const int kdsp[16][4] = { - {1, 1, 1, 0}, - {1, 1, 2, 0}, - {2, 1, 1, 1}, - {2, 1, 2, -233}, - {3, 1, 1, 1}, - {3, 1, 2, 1}, - {3, 2, 1, 1}, - {4, 1, 1, 2}, - {4, 1, 2, -233}, - {4, 2, 1, -234}, - {5, 1, 1, -234}, - {5, 1, 2, 2}, - {5, 2, 2, 2}, - {7, 1, 1, 3}, - {7, 1, 2, 3}, - {7, 2, 1, -233}, - }; - - for (int i = 0; i < 16; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1) - || test_convolution_int8(9, 7, 2, 2, k, d, s, p, 1) - || test_convolution_int8(9, 7, 3, 3, k, d, s, p, 1) - || test_convolution_int8(9, 7, 4, 4, k, d, s, p, 1) - || test_convolution_int8(9, 7, 7, 7, k, d, s, p, 1) - || test_convolution_int8(9, 7, 8, 8, k, d, s, p, 1) - || test_convolution_int8(9, 7, 15, 15, k, d, s, p, 1) - || test_convolution_int8(9, 7, 16, 15, k, d, s, p, 1) - || test_convolution_int8(9, 7, 15, 16, k, d, s, p, 1) - || test_convolution_int8(9, 7, 16, 16, k, d, s, p, 1); - - if (ret != 0) - return -1; - } - for (int i = 0; i < 16; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 2, 2, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 3, 3, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 4, 4, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 7, 7, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 8, 8, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 15, 15, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 16, 15, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 15, 16, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 16, 16, k, d, s, p, 1, true); - - if (ret != 0) - return -1; - } - - return 0 - || test_convolution_int8(11, 11, 8, 16, 3, 1, 1, 1, 1) - || test_convolution_int8(13, 16, 16, 24, 3, 1, 1, 1, 1) - || test_convolution_int8(8, 8, 16, 24, 3, 1, 1, 1, 0) - || test_convolution_int8(4, 8, 16, 24, 3, 1, 1, 1, 1) - || test_convolution_int8(4, 20, 16, 24, 3, 1, 1, 1, 0) - || test_convolution_int8(6, 7, 64, 64, 3, 1, 2, 0, 1) - || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0) - || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0); -} -#endif // NCNN_INT8 - int main() { SRAND(7767517); -#if NCNN_INT8 - return 0 - || test_convolution_0() - || test_convolution_1() - || test_convolution_2() - || test_convolution_3(); -#else - return 0 - || test_convolution_0() - || test_convolution_2() - || test_convolution_3(); -#endif + return test_convolution_0(); } diff --git a/tests/test_convolution_1.cpp b/tests/test_convolution_1.cpp new file mode 100644 index 000000000000..b7641a4ea0f9 --- /dev/null +++ b/tests/test_convolution_1.cpp @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/convolution.h" +#include "testutil.h" + +static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + // larget epsilon for winograd optimization + if (kernel == 3 && dilation == 1 && stride == 1 && c >= 16 && outch >= 16) + { + Randomize(a, -1, 1); + if (c >= 64 || outch >= 64) + Randomize(weights[0], -0.3, 0.3); + else + Randomize(weights[0], -1, 1); + epsilon = 0.002; + } + + int ret = test_layer("Convolution", pd, weights, a, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_0() +{ + static const int kdsp[16][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 1, 2}, + {4, 1, 2, -233}, + {4, 2, 1, -234}, + {5, 1, 1, -234}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + {7, 1, 1, 3}, + {7, 1, 2, 3}, + {7, 2, 1, -233}, + }; + + for (int i = 12; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolution(9, 7, 1, 1, k, d, s, p, 1) + || test_convolution(9, 7, 4, 13, k, d, s, p, 0) + || test_convolution(9, 7, 13, 4, k, d, s, p, 1) + || test_convolution(9, 7, 12, 12, k, d, s, p, 0) + || test_convolution(9, 7, 8, 12, k, d, s, p, 1) + || test_convolution(9, 7, 8, 13, k, d, s, p, 0) + || test_convolution(9, 7, 13, 8, k, d, s, p, 1) + || test_convolution(9, 7, 12, 16, k, d, s, p, 0) + || test_convolution(9, 7, 15, 15, k, d, s, p, 0) + || test_convolution(9, 7, 16, 16, k, d, s, p, 0) + || test_convolution(18, 17, 1, 1, k, d, s, p, 1) + || test_convolution(18, 17, 4, 13, k, d, s, p, 0) + || test_convolution(18, 17, 13, 4, k, d, s, p, 1) + || test_convolution(18, 17, 12, 12, k, d, s, p, 0) + || test_convolution(18, 17, 8, 12, k, d, s, p, 1) + || test_convolution(18, 17, 8, 13, k, d, s, p, 0) + || test_convolution(18, 17, 13, 8, k, d, s, p, 1) + || test_convolution(18, 17, 12, 16, k, d, s, p, 0) + || test_convolution(18, 17, 15, 15, k, d, s, 
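// For reference, a small helper (illustration only) mapping a {kernel, dilation, stride, pad}
// row of the kdsp table above to an output size for the explicit pad >= 0 cases.  The
// -233 / -234 entries appear to select ncnn's automatic "same"-style padding modes and are
// not covered by this formula; that reading is an assumption, not something stated in the patch.
static int conv_output_size(int in, int kernel, int dilation, int stride, int pad)
{
    const int kernel_extent = dilation * (kernel - 1) + 1;
    return (in + 2 * pad - kernel_extent) / stride + 1;
}
// e.g. kdsp row {3, 1, 2, 1} on a 9-wide input: conv_output_size(9, 3, 1, 2, 1) == 5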
p, 0) + || test_convolution(18, 17, 16, 16, k, d, s, p, 0) + || test_convolution(25, 33, 1, 1, k, d, s, p, 1) + || test_convolution(25, 33, 4, 13, k, d, s, p, 0) + || test_convolution(25, 33, 13, 4, k, d, s, p, 1) + || test_convolution(25, 33, 12, 12, k, d, s, p, 0) + || test_convolution(25, 33, 8, 12, k, d, s, p, 1) + || test_convolution(25, 33, 8, 13, k, d, s, p, 0) + || test_convolution(25, 33, 13, 8, k, d, s, p, 1) + || test_convolution(25, 33, 12, 16, k, d, s, p, 0) + || test_convolution(25, 33, 15, 15, k, d, s, p, 0) + || test_convolution(25, 33, 16, 16, k, d, s, p, 0); + + if (ret != 0) + return -1; + } + + return 0; +} + +int main() +{ + SRAND(7767517); + + return test_convolution_0(); +} diff --git a/tests/test_convolution_2.cpp b/tests/test_convolution_2.cpp new file mode 100644 index 000000000000..2dbaf59b3ba1 --- /dev/null +++ b/tests/test_convolution_2.cpp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/convolution.h" +#include "testutil.h" + +static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 
2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + // larget epsilon for winograd optimization + if (kernel == 3 && dilation == 1 && stride == 1 && c >= 16 && outch >= 16) + { + Randomize(a, -1, 1); + if (c >= 64 || outch >= 64) + Randomize(weights[0], -0.3, 0.3); + else + Randomize(weights[0], -1, 1); + epsilon = 0.002; + } + + int ret = test_layer("Convolution", pd, weights, a, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_0() +{ + return 0 + || test_convolution(7, 5, 1, 4, 3, 1, 1, 1, 1) + || test_convolution(14, 5, 1, 4, 3, 1, 2, 1, 1) + || test_convolution(11, 5, 2, 12, 2, 2, 2, 1, 1) + || test_convolution(15, 11, 4, 4, 3, 1, 1, 1, 1) + || test_convolution(15, 11, 8, 8, 3, 1, 1, 1, 1) + || test_convolution(11, 11, 8, 16, 3, 1, 1, 1, 1) + || test_convolution(13, 16, 16, 24, 3, 1, 1, 1, 1) + || test_convolution(20, 19, 24, 24, 3, 1, 1, 1, 1) + || test_convolution(8, 8, 16, 24, 3, 1, 1, 1, 0) + || test_convolution(4, 8, 16, 24, 3, 1, 1, 1, 1) + || test_convolution(4, 20, 16, 24, 3, 1, 1, 1, 0) + || test_convolution(6, 7, 64, 64, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 24, 32, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 24, 32, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 24, 32, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 24, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 32, 24, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 24, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 28, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 32, 28, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 28, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 26, 32, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 26, 32, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 26, 32, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 26, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 32, 26, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 26, 3, 1, 2, 0, 1) + || test_convolution(30, 30, 32, 26, 3, 1, 1, 1, 0) + || test_convolution(12, 18, 8, 16, 3, 1, 1, 1, 1) + || test_convolution(42, 18, 32, 160, 3, 1, 1, 1, 1) + || test_convolution(12, 18, 32, 160, 3, 1, 1, 1, 1) + || test_convolution(12, 18, 4, 12, 3, 1, 1, 1, 1) + || test_convolution(42, 18, 28, 140, 3, 1, 1, 1, 1) + || test_convolution(12, 18, 28, 140, 3, 1, 1, 1, 1); +} + +int main() +{ + SRAND(7767517); + + return test_convolution_0(); +} diff --git a/tests/test_convolution_3.cpp b/tests/test_convolution_3.cpp new file mode 100644 index 000000000000..3d6f91d096a9 --- /dev/null +++ b/tests/test_convolution_3.cpp @@ -0,0 +1,288 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/convolution.h" +#include "testutil.h" + +static int test_convolution_vec(int w, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + ncnn::Mat a = RandomMat(w); + + ncnn::ParamDict pd; + pd.set(0, outch); // num_output + pd.set(1, kernel); // kernel_w + pd.set(2, dilation); // dilation_w + pd.set(3, stride); // stride_w + pd.set(4, pad); // pad_w + pd.set(5, bias); // bias_term + pd.set(6, outch * w * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(outch * w * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + int ret = test_layer("Convolution", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_convolution_vec failed w=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_2() +{ + return 0 + || test_convolution_vec(1, 1, 1, 1, 1, 0, 1) + || test_convolution_vec(11, 12, 1, 1, 1, 0, 0) + || test_convolution_vec(20, 15, 1, 1, 1, 0, 1) + || test_convolution_vec(12, 20, 1, 1, 1, 0, 0) + || test_convolution_vec(3, 24, 1, 1, 1, 0, 1) + || test_convolution_vec(24, 5, 1, 1, 1, 0, 0) + || test_convolution_vec(32, 24, 1, 1, 1, 0, 1) + || test_convolution_vec(12, 32, 1, 1, 1, 0, 0) + || test_convolution_vec(64, 20, 1, 1, 1, 0, 1) + || test_convolution_vec(64, 128, 1, 1, 1, 0, 0); +} + +static int test_convolution_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, 0); + pd.set(1, 0); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, 0); + pd.set(19, 1); // dynamic weight + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector as(bias ? 
3 : 2); + as[0] = a; + as[1] = RandomMat(kernel, kernel, c, outch); + if (bias) + as[2] = RandomMat(outch); + + std::vector weights(0); + + int ret = test_layer("Convolution", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_convolution_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_3() +{ + static const int kdsp[7][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, -234}, + }; + + for (int i = 0; i < 7; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolution_dynamic(11, 10, 1, 1, k, d, s, p, 1) + || test_convolution_dynamic(11, 10, 4, 13, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 13, 4, k, d, s, p, 1) + || test_convolution_dynamic(11, 10, 12, 12, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 8, 12, k, d, s, p, 1) + || test_convolution_dynamic(11, 10, 8, 13, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 13, 8, k, d, s, p, 1) + || test_convolution_dynamic(11, 10, 12, 16, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 15, 15, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 16, 16, k, d, s, p, 0); + + if (ret != 0) + return -1; + } + + return 0; +} + +#if NCNN_INT8 +static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + pd.set(8, requant ? 101 : 1); // int8_scale_term + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 5 : 4); + weights[0] = RandomMat(outch * c * kernel * kernel); + + ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel); + ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); + ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); + if (bias) + { + weights[1] = RandomMat(outch); + weights[2] = weight_scales; + weights[3] = input_scales; + weights[4] = top_scales; + } + else + { + weights[1] = weight_scales; + weights[2] = input_scales; + weights[3] = top_scales; + } + + int flag = TEST_LAYER_DISABLE_GPU_TESTING; + int ret = test_layer("Convolution", pd, weights, a, requant ? 
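// The int8 tests above pass per-output-channel weight scales and a per-blob input scale as
// extra weight Mats.  The helper below is a hypothetical illustration of the common symmetric
// quantization convention (scale = 127 / absmax per row); it is not the scales_mat() from
// testutil.h, whose exact behaviour is not shown in this patch.
#include "mat.h" // ncnn/mat.h in an installed layout
#include <algorithm>
#include <cmath>

static ncnn::Mat absmax_scales(const ncnn::Mat& weights, int rows, int cols)
{
    ncnn::Mat scales(rows);
    const float* w = weights;
    for (int i = 0; i < rows; i++)
    {
        float absmax = 0.f;
        for (int j = 0; j < cols; j++)
            absmax = std::max(absmax, (float)std::fabs(w[i * cols + j]));

        // guard against an all-zero row
        scales[i] = absmax == 0.f ? 1.f : 127.f / absmax;
    }
    return scales;
}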
1.0f : 0.001f, 0, flag); + if (ret != 0) + { + fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_1() +{ + static const int kdsp[16][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 1, 2}, + {4, 1, 2, -233}, + {4, 2, 1, -234}, + {5, 1, 1, -234}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + {7, 1, 1, 3}, + {7, 1, 2, 3}, + {7, 2, 1, -233}, + }; + + for (int i = 0; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1) + || test_convolution_int8(9, 7, 2, 2, k, d, s, p, 1) + || test_convolution_int8(9, 7, 3, 3, k, d, s, p, 1) + || test_convolution_int8(9, 7, 4, 4, k, d, s, p, 1) + || test_convolution_int8(9, 7, 7, 7, k, d, s, p, 1) + || test_convolution_int8(9, 7, 8, 8, k, d, s, p, 1) + || test_convolution_int8(9, 7, 15, 15, k, d, s, p, 1) + || test_convolution_int8(9, 7, 16, 15, k, d, s, p, 1) + || test_convolution_int8(9, 7, 15, 16, k, d, s, p, 1) + || test_convolution_int8(9, 7, 16, 16, k, d, s, p, 1); + + if (ret != 0) + return -1; + } + for (int i = 0; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 2, 2, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 3, 3, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 4, 4, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 7, 7, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 8, 8, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 15, 15, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 16, 15, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 15, 16, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 16, 16, k, d, s, p, 1, true); + + if (ret != 0) + return -1; + } + + return 0 + || test_convolution_int8(11, 11, 8, 16, 3, 1, 1, 1, 1) + || test_convolution_int8(13, 16, 16, 24, 3, 1, 1, 1, 1) + || test_convolution_int8(8, 8, 16, 24, 3, 1, 1, 1, 0) + || test_convolution_int8(4, 8, 16, 24, 3, 1, 1, 1, 1) + || test_convolution_int8(4, 20, 16, 24, 3, 1, 1, 1, 0) + || test_convolution_int8(6, 7, 64, 64, 3, 1, 2, 0, 1) + || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0) + || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0); +} +#endif // NCNN_INT8 + +int main() +{ + SRAND(7767517); + +#if NCNN_INT8 + return 0 + || test_convolution_1() + || test_convolution_2() + || test_convolution_3(); +#else + return 0 + || test_convolution_2() + || test_convolution_3(); +#endif +} diff --git a/tests/test_convolutiondepthwise.cpp b/tests/test_convolutiondepthwise.cpp index 03317b68c1eb..715fc73662c4 100644 --- a/tests/test_convolutiondepthwise.cpp +++ b/tests/test_convolutiondepthwise.cpp @@ -125,222 +125,9 @@ static int test_convolutiondepthwise_0() return 0; } -static int test_convolutiondepthwise_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group) -{ - ncnn::Mat a = RandomMat(w, h, c); - - ncnn::ParamDict pd; - pd.set(0, 0); - pd.set(1, 
0); - pd.set(2, dilation); - pd.set(3, stride); - pd.set(4, pad); - pd.set(5, bias); - pd.set(6, 0); - pd.set(7, group); - pd.set(19, 1); // dynamic weight - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector as(bias ? 3 : 2); - as[0] = a; - as[1] = RandomMat(kernel, kernel, c / group, outch); - if (bias) - as[2] = RandomMat(outch); - - std::vector weights(0); - - int ret = test_layer("ConvolutionDepthWise", pd, weights, as); - if (ret != 0) - { - fprintf(stderr, "test_convolutiondepthwise_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolutiondepthwise_2() -{ - static const int kdsp[7][4] = { - {1, 1, 1, 0}, - {1, 1, 2, 0}, - {2, 1, 1, 1}, - {2, 1, 2, -233}, - {3, 1, 1, 1}, - {3, 1, 2, 1}, - {3, 2, 1, -234}, - }; - - for (int i = 0; i < 7; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolutiondepthwise_dynamic(11, 10, 1, 1, k, d, s, p, 1, 1) - || test_convolutiondepthwise_dynamic(11, 10, 2, 2, k, d, s, p, 0, 1) - || test_convolutiondepthwise_dynamic(11, 10, 2, 2, k, d, s, p, 1, 2) - || test_convolutiondepthwise_dynamic(11, 10, 3, 3, k, d, s, p, 0, 3) - || test_convolutiondepthwise_dynamic(11, 10, 4, 2, k, d, s, p, 1, 2) - || test_convolutiondepthwise_dynamic(11, 10, 4, 4, k, d, s, p, 0, 4) - || test_convolutiondepthwise_dynamic(11, 10, 7, 7, k, d, s, p, 1, 7) - || test_convolutiondepthwise_dynamic(11, 10, 8, 8, k, d, s, p, 0, 2) - || test_convolutiondepthwise_dynamic(11, 10, 8, 8, k, d, s, p, 1, 8) - || test_convolutiondepthwise_dynamic(11, 10, 12, 12, k, d, s, p, 0, 4) - || test_convolutiondepthwise_dynamic(11, 10, 15, 15, k, d, s, p, 1, 15) - || test_convolutiondepthwise_dynamic(11, 10, 16, 8, k, d, s, p, 0, 2) - || test_convolutiondepthwise_dynamic(11, 10, 16, 16, k, d, s, p, 1, 16); - - if (ret != 0) - return -1; - } - - return 0; -} - -#if NCNN_INT8 -static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, bool requant = false) -{ - ncnn::Mat a = RandomMat(w, h, c); - - ncnn::ParamDict pd; - pd.set(0, outch); - pd.set(1, kernel); - pd.set(2, dilation); - pd.set(3, stride); - pd.set(4, pad); - pd.set(5, bias); - pd.set(6, outch / group * c / group * kernel * kernel * group); - pd.set(7, group); - pd.set(8, requant ? 101 : 1); // int8_scale_term - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector weights(bias ? 5 : 4); - weights[0] = RandomMat(outch / group * c / group * kernel * kernel * group); - ncnn::Mat weight_scales = scales_mat(weights[0], group, c * kernel * kernel / group, c * kernel * kernel / group); - ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); - ncnn::Mat top_scales = requant ? 
scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); - if (bias) - { - weights[1] = RandomMat(outch); - weights[2] = weight_scales; - weights[3] = input_scales; - weights[4] = top_scales; - } - else - { - weights[1] = weight_scales; - weights[2] = input_scales; - weights[3] = top_scales; - } - - int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("ConvolutionDepthWise", pd, weights, a, requant ? 1.0f : 0.001f, 0, flag); - if (ret != 0) - { - fprintf(stderr, "test_convolutiondepthwise_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, requant, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolutiondepthwise_1() -{ - static const int kdsp[16][4] = { - {1, 1, 1, 0}, - {1, 1, 2, 0}, - {2, 1, 1, 1}, - {2, 1, 2, -233}, - {3, 1, 1, 1}, - {3, 1, 2, 1}, - {3, 2, 1, 1}, - {4, 1, 1, 2}, - {4, 1, 2, -233}, - {4, 2, 1, -234}, - {5, 1, 1, -234}, - {5, 1, 2, 2}, - {5, 2, 2, 2}, - {7, 1, 1, 3}, - {7, 1, 2, 3}, - {7, 2, 1, -233}, - }; - - for (int i = 0; i < 16; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolutiondepthwise_int8(15, 7, 1, 1, k, d, s, p, 1, 1) - || test_convolutiondepthwise_int8(15, 7, 2, 2, k, d, s, p, 0, 1) - || test_convolutiondepthwise_int8(15, 7, 2, 2, k, d, s, p, 1, 2) - || test_convolutiondepthwise_int8(15, 7, 3, 3, k, d, s, p, 0, 3) - || test_convolutiondepthwise_int8(15, 7, 4, 2, k, d, s, p, 1, 2) - || test_convolutiondepthwise_int8(15, 7, 4, 4, k, d, s, p, 0, 4) - || test_convolutiondepthwise_int8(15, 7, 7, 7, k, d, s, p, 1, 7) - || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 0, 2) - || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 1, 8) - || test_convolutiondepthwise_int8(15, 7, 12, 12, k, d, s, p, 0, 4) - || test_convolutiondepthwise_int8(15, 7, 15, 15, k, d, s, p, 1, 15) - || test_convolutiondepthwise_int8(15, 7, 16, 8, k, d, s, p, 0, 2) - || test_convolutiondepthwise_int8(15, 7, 16, 16, k, d, s, p, 1, 16); - - if (ret != 0) - return -1; - } - - for (int i = 0; i < 16; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolutiondepthwise_int8(9, 7, 1, 1, k, d, s, p, 1, 1, true) - || test_convolutiondepthwise_int8(9, 7, 2, 2, k, d, s, p, 0, 1, true) - || test_convolutiondepthwise_int8(9, 7, 2, 2, k, d, s, p, 1, 2, true) - || test_convolutiondepthwise_int8(9, 7, 3, 3, k, d, s, p, 0, 3, true) - || test_convolutiondepthwise_int8(9, 7, 4, 2, k, d, s, p, 1, 2, true) - || test_convolutiondepthwise_int8(9, 7, 4, 4, k, d, s, p, 0, 4, true) - || test_convolutiondepthwise_int8(9, 7, 7, 7, k, d, s, p, 1, 7, true) - || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 0, 2, true) - || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 1, 8, true) - || test_convolutiondepthwise_int8(9, 7, 12, 12, k, d, s, p, 0, 4, true) - || test_convolutiondepthwise_int8(9, 7, 15, 15, k, d, s, p, 1, 15, true) - || test_convolutiondepthwise_int8(9, 7, 16, 8, k, d, s, p, 0, 2, true) - || test_convolutiondepthwise_int8(9, 7, 16, 16, k, d, s, p, 1, 16, true); - - if (ret != 0) - return -1; - } - - return 0; -} -#endif // NCNN_INT8 - int main() { SRAND(7767517); -#if NCNN_INT8 - return test_convolutiondepthwise_0() || test_convolutiondepthwise_1() || 
test_convolutiondepthwise_2(); -#else - return test_convolutiondepthwise_0() || test_convolutiondepthwise_2(); -#endif + return test_convolutiondepthwise_0(); } diff --git a/tests/test_convolutiondepthwise_1.cpp b/tests/test_convolutiondepthwise_1.cpp new file mode 100644 index 000000000000..3d10a7a8e855 --- /dev/null +++ b/tests/test_convolutiondepthwise_1.cpp @@ -0,0 +1,236 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/convolutiondepthwise.h" +#include "testutil.h" + +static int test_convolutiondepthwise_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, 0); + pd.set(1, 0); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, 0); + pd.set(7, group); + pd.set(19, 1); // dynamic weight + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector as(bias ? 
3 : 2); + as[0] = a; + as[1] = RandomMat(kernel, kernel, c / group, outch); + if (bias) + as[2] = RandomMat(outch); + + std::vector weights(0); + + int ret = test_layer("ConvolutionDepthWise", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_convolutiondepthwise_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolutiondepthwise_2() +{ + static const int kdsp[7][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, -234}, + }; + + for (int i = 0; i < 7; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolutiondepthwise_dynamic(11, 10, 1, 1, k, d, s, p, 1, 1) + || test_convolutiondepthwise_dynamic(11, 10, 2, 2, k, d, s, p, 0, 1) + || test_convolutiondepthwise_dynamic(11, 10, 2, 2, k, d, s, p, 1, 2) + || test_convolutiondepthwise_dynamic(11, 10, 3, 3, k, d, s, p, 0, 3) + || test_convolutiondepthwise_dynamic(11, 10, 4, 2, k, d, s, p, 1, 2) + || test_convolutiondepthwise_dynamic(11, 10, 4, 4, k, d, s, p, 0, 4) + || test_convolutiondepthwise_dynamic(11, 10, 7, 7, k, d, s, p, 1, 7) + || test_convolutiondepthwise_dynamic(11, 10, 8, 8, k, d, s, p, 0, 2) + || test_convolutiondepthwise_dynamic(11, 10, 8, 8, k, d, s, p, 1, 8) + || test_convolutiondepthwise_dynamic(11, 10, 12, 12, k, d, s, p, 0, 4) + || test_convolutiondepthwise_dynamic(11, 10, 15, 15, k, d, s, p, 1, 15) + || test_convolutiondepthwise_dynamic(11, 10, 16, 8, k, d, s, p, 0, 2) + || test_convolutiondepthwise_dynamic(11, 10, 16, 16, k, d, s, p, 1, 16); + + if (ret != 0) + return -1; + } + + return 0; +} + +#if NCNN_INT8 +static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, bool requant = false) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch / group * c / group * kernel * kernel * group); + pd.set(7, group); + pd.set(8, requant ? 101 : 1); // int8_scale_term + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 5 : 4); + weights[0] = RandomMat(outch / group * c / group * kernel * kernel * group); + ncnn::Mat weight_scales = scales_mat(weights[0], group, c * kernel * kernel / group, c * kernel * kernel / group); + ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); + ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); + if (bias) + { + weights[1] = RandomMat(outch); + weights[2] = weight_scales; + weights[3] = input_scales; + weights[4] = top_scales; + } + else + { + weights[1] = weight_scales; + weights[2] = input_scales; + weights[3] = top_scales; + } + + int flag = TEST_LAYER_DISABLE_GPU_TESTING; + int ret = test_layer("ConvolutionDepthWise", pd, weights, a, requant ? 
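// Illustration of the weight_data_size value (ParamDict key 6) used by the grouped cases
// above: each group maps c/group input channels to outch/group output channels, so the total
// weight count is (outch/group) * (c/group) * kernel * kernel * group, and group == c == outch
// degenerates to the depthwise case of one kernel*kernel filter per channel.
static int dw_weight_data_size(int c, int outch, int kernel, int group)
{
    return outch / group * c / group * kernel * kernel * group;
}
// e.g. depthwise 16-channel 3x3: dw_weight_data_size(16, 16, 3, 16) == 144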
1.0f : 0.001f, 0, flag); + if (ret != 0) + { + fprintf(stderr, "test_convolutiondepthwise_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, requant, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolutiondepthwise_1() +{ + static const int kdsp[16][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 1, 2}, + {4, 1, 2, -233}, + {4, 2, 1, -234}, + {5, 1, 1, -234}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + {7, 1, 1, 3}, + {7, 1, 2, 3}, + {7, 2, 1, -233}, + }; + + for (int i = 0; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolutiondepthwise_int8(15, 7, 1, 1, k, d, s, p, 1, 1) + || test_convolutiondepthwise_int8(15, 7, 2, 2, k, d, s, p, 0, 1) + || test_convolutiondepthwise_int8(15, 7, 2, 2, k, d, s, p, 1, 2) + || test_convolutiondepthwise_int8(15, 7, 3, 3, k, d, s, p, 0, 3) + || test_convolutiondepthwise_int8(15, 7, 4, 2, k, d, s, p, 1, 2) + || test_convolutiondepthwise_int8(15, 7, 4, 4, k, d, s, p, 0, 4) + || test_convolutiondepthwise_int8(15, 7, 7, 7, k, d, s, p, 1, 7) + || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 0, 2) + || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 1, 8) + || test_convolutiondepthwise_int8(15, 7, 12, 12, k, d, s, p, 0, 4) + || test_convolutiondepthwise_int8(15, 7, 15, 15, k, d, s, p, 1, 15) + || test_convolutiondepthwise_int8(15, 7, 16, 8, k, d, s, p, 0, 2) + || test_convolutiondepthwise_int8(15, 7, 16, 16, k, d, s, p, 1, 16); + + if (ret != 0) + return -1; + } + + for (int i = 0; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolutiondepthwise_int8(9, 7, 1, 1, k, d, s, p, 1, 1, true) + || test_convolutiondepthwise_int8(9, 7, 2, 2, k, d, s, p, 0, 1, true) + || test_convolutiondepthwise_int8(9, 7, 2, 2, k, d, s, p, 1, 2, true) + || test_convolutiondepthwise_int8(9, 7, 3, 3, k, d, s, p, 0, 3, true) + || test_convolutiondepthwise_int8(9, 7, 4, 2, k, d, s, p, 1, 2, true) + || test_convolutiondepthwise_int8(9, 7, 4, 4, k, d, s, p, 0, 4, true) + || test_convolutiondepthwise_int8(9, 7, 7, 7, k, d, s, p, 1, 7, true) + || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 0, 2, true) + || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 1, 8, true) + || test_convolutiondepthwise_int8(9, 7, 12, 12, k, d, s, p, 0, 4, true) + || test_convolutiondepthwise_int8(9, 7, 15, 15, k, d, s, p, 1, 15, true) + || test_convolutiondepthwise_int8(9, 7, 16, 8, k, d, s, p, 0, 2, true) + || test_convolutiondepthwise_int8(9, 7, 16, 16, k, d, s, p, 1, 16, true); + + if (ret != 0) + return -1; + } + + return 0; +} +#endif // NCNN_INT8 + +int main() +{ + SRAND(7767517); + +#if NCNN_INT8 + return test_convolutiondepthwise_1() || test_convolutiondepthwise_2(); +#else + return test_convolutiondepthwise_2(); +#endif +} diff --git a/tests/test_crop.cpp b/tests/test_crop.cpp index caa2876c4996..b2a29778fec4 100644 --- a/tests/test_crop.cpp +++ b/tests/test_crop.cpp @@ -42,112 +42,6 @@ static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset, return ret; } -static ncnn::Mat IntArrayMat(int a0) -{ - ncnn::Mat m(1); - int* p = m; - p[0] = a0; - return m; -} - -static 
ncnn::Mat IntArrayMat(int a0, int a1) -{ - ncnn::Mat m(2); - int* p = m; - p[0] = a0; - p[1] = a1; - return m; -} - -static ncnn::Mat IntArrayMat(int a0, int a1, int a2) -{ - ncnn::Mat m(3); - int* p = m; - p[0] = a0; - p[1] = a1; - p[2] = a2; - return m; -} - -static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) -{ - ncnn::Mat m(4); - int* p = m; - p[0] = a0; - p[1] = a1; - p[2] = a2; - p[3] = a3; - return m; -} - -static void print_int_array(const ncnn::Mat& a) -{ - const int* pa = a; - - fprintf(stderr, "["); - for (int i = 0; i < a.w; i++) - { - fprintf(stderr, " %d", pa[i]); - } - fprintf(stderr, " ]"); -} - -static int test_crop(const ncnn::Mat& a, const ncnn::Mat& starts, const ncnn::Mat& ends, const ncnn::Mat& axes) -{ - ncnn::ParamDict pd; - pd.set(9, starts); // starts - pd.set(10, ends); // ends - pd.set(11, axes); // axes - - std::vector weights(0); - - int ret = test_layer("Crop", pd, weights, a); - if (ret != 0) - { - fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d)", a.dims, a.w, a.h, a.d, a.c); - fprintf(stderr, " starts="); - print_int_array(starts); - fprintf(stderr, " ends="); - print_int_array(ends); - fprintf(stderr, " axes="); - print_int_array(axes); - fprintf(stderr, "\n"); - } - - return ret; -} - -static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset, int coffset, const ncnn::Mat& ref) -{ - ncnn::ParamDict pd; - pd.set(0, woffset); - pd.set(1, hoffset); - pd.set(13, doffset); - pd.set(2, coffset); - pd.set(3, 0); // outw - pd.set(4, 0); // outh - pd.set(14, 0); // outd - pd.set(5, 0); // outc - pd.set(6, 0); // woffset2 - pd.set(7, 0); // hoffset2 - pd.set(15, 0); // doffset2 - pd.set(8, 0); // coffset2 - - std::vector weights(0); - - std::vector ab(2); - ab[0] = a; - ab[1] = ref; - - int ret = test_layer("Crop", pd, weights, ab); - if (ret != 0) - { - fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d) woffset=%d hoffset=%d doffset=%d coffset=%d ref.dims=%d ref=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c, woffset, hoffset, doffset, coffset, ref.dims, ref.w, ref.h, ref.d, ref.c); - } - - return ret; -} - static int test_crop_0(const ncnn::Mat& a) { return 0 @@ -161,30 +55,6 @@ static int test_crop_0(const ncnn::Mat& a) || test_crop(a, 16, 0, 0, 0, -233, 0, 0, 0, 7, 0, 0, 0); } -static int test_crop_1(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, IntArrayMat(12), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(16), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(-1)) - || test_crop(a, IntArrayMat(16), IntArrayMat(16 + 12), ncnn::Mat()) - || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(-1)) - || test_crop(a, IntArrayMat(16), IntArrayMat(-16 + 1), ncnn::Mat()); -} - -static int test_crop_2(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, 0, 0, 0, 0, a) - || test_crop(a, 0, 0, 0, 0, ncnn::Mat(27)) - - || test_crop(a, 11, 0, 0, 0, ncnn::Mat(7)) - || test_crop(a, 12, 0, 0, 0, ncnn::Mat(12)) - || test_crop(a, 16, 0, 0, 0, ncnn::Mat(16)); -} - static int test_crop_3(const ncnn::Mat& a) { return 0 @@ -220,52 +90,6 @@ static int test_crop_3(const ncnn::Mat& a) || test_crop(a, 4, 8, 0, 0, -233, -233, 0, 0, 6, 12, 0, 0); } -static int test_crop_4(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, IntArrayMat(12), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), 
IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(1)) - || test_crop(a, IntArrayMat(5, 11), IntArrayMat(-233, -233), IntArrayMat(0, 1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 12), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(8), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(9), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(12), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 12), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 16, 10), IntArrayMat(0, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(-16 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(-7 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-12 + 1), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-6 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-2, -1)); -} - -static int test_crop_5(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, 0, 0, 0, 0, a) - - || test_crop(a, 0, 12, 0, 0, ncnn::Mat(8, 7)) - || test_crop(a, 5, 0, 0, 0, ncnn::Mat(7, 27)) - - || test_crop(a, 5, 11, 0, 0, ncnn::Mat(5, 12)) - || test_crop(a, 6, 12, 0, 0, ncnn::Mat(4, 16)) - || test_crop(a, 4, 8, 0, 0, ncnn::Mat(6, 7)); -} - static int test_crop_6(const ncnn::Mat& a) { return 0 @@ -338,94 +162,6 @@ static int test_crop_6(const ncnn::Mat& a) || test_crop(a, 4, 4, 0, 8, -233, -233, 0, -233, 6, 2, 0, 12); } -static int test_crop_7(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, IntArrayMat(11), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(-1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-233, -233), IntArrayMat(0, -1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-233, -233), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(6, 6), IntArrayMat(-233, -233), IntArrayMat(1, -1)) - || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 7), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 12), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 16), IntArrayMat(0)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(13), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(12), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(11), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(12), IntArrayMat(2)) - || test_crop(a, IntArrayMat(6), IntArrayMat(11), 
IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(13), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 16, 12), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 12, 13), IntArrayMat(0, -2)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 16, 13), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 11), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 7, 12), IntArrayMat(0, -1)) - - || test_crop(a, IntArrayMat(5, 4), IntArrayMat(12, 12), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(6, 3), IntArrayMat(13, 13), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(4, 2), IntArrayMat(11, 11), IntArrayMat(-2, -1)) - - || test_crop(a, IntArrayMat(11, 5, 2), IntArrayMat(11 + 7, 11, 11), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(12, 6, 4), IntArrayMat(12 + 16, 12, 12), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 3), IntArrayMat(8 + 12, 13, 13), IntArrayMat(-3, -2, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-16 + 1), IntArrayMat(-3)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(-6 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-5 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(2)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-4 + 1), IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-6 + 1), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(-3, -2)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-3, -1)) - - || test_crop(a, IntArrayMat(5, 2), IntArrayMat(-5 + 1, -5 + 1), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(6, 4), IntArrayMat(-4 + 1, -4 + 1), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(4, 3), IntArrayMat(-6 + 1, -6 + 1), IntArrayMat(-2, -1)) - - || test_crop(a, IntArrayMat(11, 5, 4), IntArrayMat(-7 + 1, -5 + 1, -5 + 1), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(12, 6, 3), IntArrayMat(-12 + 1, -6 + 1, -6 + 1), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 2), IntArrayMat(-16 + 1, -4 + 1, -4 + 1), IntArrayMat(-3, -2, -1)); -} - -static int test_crop_8(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, 0, 0, 0, 0, a) - - || test_crop(a, 0, 5, 0, 0, ncnn::Mat(6, 6)) - || test_crop(a, 6, 0, 0, 0, ncnn::Mat(8, 8)) - || test_crop(a, 5, 2, 0, 0, ncnn::Mat(6, 3)) - || test_crop(a, 6, 3, 0, 0, ncnn::Mat(8, 4)) - || test_crop(a, 4, 4, 0, 0, ncnn::Mat(7, 5)) - - || test_crop(a, 5, 3, 0, 11, ncnn::Mat(7, 3, 7)) - || test_crop(a, 6, 4, 0, 12, ncnn::Mat(6, 4, 12)) - || test_crop(a, 4, 2, 0, 8, ncnn::Mat(5, 5, 16)); -} - static int test_crop_9(const ncnn::Mat& a) { return 0 @@ -524,171 +260,6 @@ static int test_crop_9(const ncnn::Mat& a) || test_crop(a, 3, 3, 3, 8, -233, -233, -233, -233, 3, 3, 3, 12); } -static int test_crop_10(const 
ncnn::Mat& a) -{ - return 0 - || test_crop(a, IntArrayMat(11), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(1)) - || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(-2)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(3)) - || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(-1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-233, -233), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-233, -233), IntArrayMat(-4, -2)) - || test_crop(a, IntArrayMat(4, 4), IntArrayMat(-233, -233), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 3)) - || test_crop(a, IntArrayMat(5, 5), IntArrayMat(-233, -233), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(4, 4), IntArrayMat(-233, -233), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(12, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(12, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(4, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(6, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(11, 5, 5, 5), IntArrayMat(-233, -233, -233, -233), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(8, 4, 4, 4), IntArrayMat(-233, -233, -233, -233), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(12, 6, 6, 6), IntArrayMat(-233, -233, -233, -233), IntArrayMat(-4, -3, -2, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 12), IntArrayMat(-4)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(11), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(13), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(12), IntArrayMat(-3)) - - || test_crop(a, IntArrayMat(3), IntArrayMat(12), IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(13), IntArrayMat(2)) - || test_crop(a, IntArrayMat(5), IntArrayMat(11), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(1), IntArrayMat(8), IntArrayMat(3)) - || test_crop(a, IntArrayMat(2), IntArrayMat(7), IntArrayMat(3)) - || test_crop(a, IntArrayMat(3), IntArrayMat(6), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 12), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 16, 13), IntArrayMat(-4, -3)) - - || test_crop(a, IntArrayMat(11, 4), IntArrayMat(11 + 12, 13), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(12, 3), IntArrayMat(12 + 16, 11), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(8, 2), IntArrayMat(8 + 7, 12), IntArrayMat(-4, -2)) - - || test_crop(a, IntArrayMat(11, 1), IntArrayMat(11 + 16, 5), IntArrayMat(0, 3)) - || test_crop(a, IntArrayMat(12, 2), IntArrayMat(12 + 7, 6), IntArrayMat(0, 3)) 
- || test_crop(a, IntArrayMat(8, 3), IntArrayMat(8 + 12, 7), IntArrayMat(-4, -1)) - - || test_crop(a, IntArrayMat(3, 3), IntArrayMat(13, 4), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(4, 2), IntArrayMat(12, 3), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(5, 1), IntArrayMat(11, 2), IntArrayMat(-3, -2)) - - || test_crop(a, IntArrayMat(5, 5), IntArrayMat(11, 8), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(4, 6), IntArrayMat(12, 9), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(3, 4), IntArrayMat(13, 7), IntArrayMat(-3, -1)) - - || test_crop(a, IntArrayMat(2, 3), IntArrayMat(12, 9), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(3, 2), IntArrayMat(11, 7), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(4, 1), IntArrayMat(10, 8), IntArrayMat(-2, -1)) - - || test_crop(a, IntArrayMat(11, 2, 2), IntArrayMat(11 + 6, 9, 9), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(12 + 1, 10, 10), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(8 + 3, 11, 11), IntArrayMat(-4, -3, -2)) - - || test_crop(a, IntArrayMat(11, 4, 4), IntArrayMat(11 + 12, 12, 12), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(12, 5, 5), IntArrayMat(12 + 8, 11, 11), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(8, 6, 6), IntArrayMat(8 + 4, 13, 13), IntArrayMat(-4, -3, -1)) - - || test_crop(a, IntArrayMat(11, 1, 4), IntArrayMat(11 + 5, 12, 12), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(12 + 3, 11, 11), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(8, 2, 5), IntArrayMat(8 + 2, 10, 10), IntArrayMat(-4, -2, -1)) - - || test_crop(a, IntArrayMat(1, 1, 1), IntArrayMat(7, 7, 7), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(2, 2, 2), IntArrayMat(8, 9, 10), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(3, 3, 3), IntArrayMat(11, 12, 13), IntArrayMat(-3, -2, -1)) - - || test_crop(a, IntArrayMat(11, 2, 3, 6), IntArrayMat(11 + 11, 10, 12, 11), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(12, 3, 4, 5), IntArrayMat(12 + 12, 9, 11, 13), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(8, 4, 5, 4), IntArrayMat(8 + 8, 8, 10, 12), IntArrayMat(-4, -3, -2, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-16 + 1), IntArrayMat(-4)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(-6 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-5 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-3)) - - || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(2)) - || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(2)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-6 + 1), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(1), IntArrayMat(-5 + 1), IntArrayMat(3)) - || test_crop(a, IntArrayMat(2), IntArrayMat(-4 + 1), IntArrayMat(3)) - || test_crop(a, IntArrayMat(3), IntArrayMat(-3 + 1), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 3), IntArrayMat(-7 + 1, -3 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 4), IntArrayMat(-12 + 1, -4 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 5), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(-4, -3)) - - || test_crop(a, IntArrayMat(11, 1), IntArrayMat(-12 + 1, -5 + 1), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(12, 2), IntArrayMat(-16 + 1, -4 + 1), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(8, 3), IntArrayMat(-7 + 1, -6 + 
1), IntArrayMat(-4, -2)) - - || test_crop(a, IntArrayMat(11, 3), IntArrayMat(-12 + 1, -2 + 1), IntArrayMat(0, 3)) - || test_crop(a, IntArrayMat(12, 4), IntArrayMat(-16 + 1, -3 + 1), IntArrayMat(0, 3)) - || test_crop(a, IntArrayMat(8, 5), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-4, -1)) - - || test_crop(a, IntArrayMat(2, 3), IntArrayMat(-4 + 1, -2 + 1), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(3, 4), IntArrayMat(-2 + 1, -3 + 1), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(4, 5), IntArrayMat(-3 + 1, -4 + 1), IntArrayMat(-3, -2)) - - || test_crop(a, IntArrayMat(3, 2), IntArrayMat(-2 + 1, -4 + 1), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(4, 3), IntArrayMat(-3 + 1, -2 + 1), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(5, 4), IntArrayMat(-4 + 1, -3 + 1), IntArrayMat(-3, -1)) - - || test_crop(a, IntArrayMat(2, 3), IntArrayMat(-4 + 1, -6 + 1), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(1, 2), IntArrayMat(-5 + 1, -5 + 1), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(3, 1), IntArrayMat(-6 + 1, -4 + 1), IntArrayMat(-2, -1)) - - || test_crop(a, IntArrayMat(11, 3, 3), IntArrayMat(-7 + 1, -3 + 1, -4 + 1), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(12, 4, 4), IntArrayMat(-12 + 1, -4 + 1, -3 + 1), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 5, 5), IntArrayMat(-16 + 1, -5 + 1, -5 + 1), IntArrayMat(-4, -3, -2)) - - || test_crop(a, IntArrayMat(11, 2, 2), IntArrayMat(-7 + 1, -5 + 1, -4 + 1), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(12, 1, 1), IntArrayMat(-12 + 1, -6 + 1, -5 + 1), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(8, 3, 3), IntArrayMat(-16 + 1, -4 + 1, -6 + 1), IntArrayMat(-4, -3, -1)) - - || test_crop(a, IntArrayMat(11, 2, 5), IntArrayMat(-7 + 1, -2 + 1, -5 + 1), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(-12 + 1, -3 + 1, -4 + 1), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-16 + 1, -4 + 1, -3 + 1), IntArrayMat(-4, -2, -1)) - - || test_crop(a, IntArrayMat(1, 3, 3), IntArrayMat(-3 + 1, -6 + 1, -4 + 1), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(2, 2, 2), IntArrayMat(-4 + 1, -4 + 1, -5 + 1), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(3, 1, 1), IntArrayMat(-5 + 1, -5 + 1, -6 + 1), IntArrayMat(-3, -2, -1)) - - || test_crop(a, IntArrayMat(11, 3, 4, 4), IntArrayMat(-7 + 1, -3 + 1, -2 + 1, -4 + 1), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(12, 4, 5, 3), IntArrayMat(-12 + 1, -4 + 1, -3 + 1, -5 + 1), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(8, 5, 6, 2), IntArrayMat(-16 + 1, -5 + 1, -4 + 1, -3 + 1), IntArrayMat(-4, -3, -2, -1)); -} - -static int test_crop_11(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, 0, 0, 0, 0, a) - - || test_crop(a, 0, 5, 0, 0, ncnn::Mat(6, 6, 6)) - || test_crop(a, 6, 0, 0, 0, ncnn::Mat(8, 8, 8)) - || test_crop(a, 5, 5, 5, 0, ncnn::Mat(6, 6, 6)) - || test_crop(a, 6, 6, 6, 0, ncnn::Mat(8, 8, 8)) - || test_crop(a, 4, 4, 4, 0, ncnn::Mat(5, 5, 5)) - - || test_crop(a, 3, 3, 3, 11, ncnn::Mat(3, 3, 3, 7)) - || test_crop(a, 4, 4, 4, 12, ncnn::Mat(6, 6, 6, 12)) - || test_crop(a, 5, 5, 5, 8, ncnn::Mat(8, 8, 8, 16)); -} - int main() { SRAND(776757); @@ -697,37 +268,13 @@ int main() || test_crop_0(RandomMat(112)) || test_crop_0(RandomMat(126)) || test_crop_0(RandomMat(127)) - || test_crop_1(RandomMat(112)) - || test_crop_1(RandomMat(126)) - || test_crop_1(RandomMat(127)) - || test_crop_2(RandomMat(112)) - || test_crop_2(RandomMat(126)) - || test_crop_2(RandomMat(127)) || test_crop_3(RandomMat(20, 48)) || 
test_crop_3(RandomMat(15, 36)) || test_crop_3(RandomMat(16, 33)) - || test_crop_4(RandomMat(20, 48)) - || test_crop_4(RandomMat(15, 36)) - || test_crop_4(RandomMat(16, 33)) - || test_crop_5(RandomMat(20, 48)) - || test_crop_5(RandomMat(15, 36)) - || test_crop_5(RandomMat(16, 33)) || test_crop_6(RandomMat(20, 20, 48)) || test_crop_6(RandomMat(15, 15, 36)) || test_crop_6(RandomMat(16, 16, 33)) - || test_crop_7(RandomMat(20, 20, 48)) - || test_crop_7(RandomMat(15, 15, 36)) - || test_crop_7(RandomMat(16, 16, 33)) - || test_crop_8(RandomMat(20, 20, 48)) - || test_crop_8(RandomMat(15, 15, 36)) - || test_crop_8(RandomMat(16, 16, 33)) || test_crop_9(RandomMat(20, 20, 20, 48)) || test_crop_9(RandomMat(15, 15, 15, 36)) - || test_crop_9(RandomMat(16, 16, 16, 33)) - || test_crop_10(RandomMat(20, 20, 20, 48)) - || test_crop_10(RandomMat(15, 15, 15, 36)) - || test_crop_10(RandomMat(16, 16, 16, 33)) - || test_crop_11(RandomMat(20, 20, 20, 48)) - || test_crop_11(RandomMat(15, 15, 15, 36)) - || test_crop_11(RandomMat(16, 16, 16, 33)); + || test_crop_9(RandomMat(16, 16, 16, 33)); } diff --git a/tests/test_crop_1.cpp b/tests/test_crop_1.cpp new file mode 100644 index 000000000000..c875a51c7fa8 --- /dev/null +++ b/tests/test_crop_1.cpp @@ -0,0 +1,377 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/crop.h" +#include "testutil.h" + +static ncnn::Mat IntArrayMat(int a0) +{ + ncnn::Mat m(1); + int* p = m; + p[0] = a0; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1) +{ + ncnn::Mat m(2); + int* p = m; + p[0] = a0; + p[1] = a1; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1, int a2) +{ + ncnn::Mat m(3); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) +{ + ncnn::Mat m(4); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + p[3] = a3; + return m; +} + +static void print_int_array(const ncnn::Mat& a) +{ + const int* pa = a; + + fprintf(stderr, "["); + for (int i = 0; i < a.w; i++) + { + fprintf(stderr, " %d", pa[i]); + } + fprintf(stderr, " ]"); +} + +static int test_crop(const ncnn::Mat& a, const ncnn::Mat& starts, const ncnn::Mat& ends, const ncnn::Mat& axes) +{ + ncnn::ParamDict pd; + pd.set(9, starts); // starts + pd.set(10, ends); // ends + pd.set(11, axes); // axes + + std::vector weights(0); + + int ret = test_layer("Crop", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d)", a.dims, a.w, a.h, a.d, a.c); + fprintf(stderr, " starts="); + print_int_array(starts); + fprintf(stderr, " ends="); + print_int_array(ends); + fprintf(stderr, " axes="); + print_int_array(axes); + fprintf(stderr, "\n"); + } + + return ret; +} + +static int test_crop_1(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, IntArrayMat(12), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(16), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(-1)) + || test_crop(a, IntArrayMat(16), IntArrayMat(16 + 12), ncnn::Mat()) + || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(-1)) + || test_crop(a, IntArrayMat(16), IntArrayMat(-16 + 1), ncnn::Mat()); +} + +static int test_crop_4(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, IntArrayMat(12), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(1)) + || test_crop(a, IntArrayMat(5, 11), IntArrayMat(-233, -233), IntArrayMat(0, 1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 12), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(8), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(9), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(12), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 12), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 16, 10), IntArrayMat(0, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(-16 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(-7 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-12 + 1), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-6 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), 
IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-2, -1)); +} + +static int test_crop_7(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, IntArrayMat(11), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(-1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-233, -233), IntArrayMat(0, -1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-233, -233), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(6, 6), IntArrayMat(-233, -233), IntArrayMat(1, -1)) + || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 7), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 12), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 16), IntArrayMat(0)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(13), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(12), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(11), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(12), IntArrayMat(2)) + || test_crop(a, IntArrayMat(6), IntArrayMat(11), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(13), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 16, 12), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 12, 13), IntArrayMat(0, -2)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 16, 13), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 11), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 7, 12), IntArrayMat(0, -1)) + + || test_crop(a, IntArrayMat(5, 4), IntArrayMat(12, 12), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(6, 3), IntArrayMat(13, 13), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(4, 2), IntArrayMat(11, 11), IntArrayMat(-2, -1)) + + || test_crop(a, IntArrayMat(11, 5, 2), IntArrayMat(11 + 7, 11, 11), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(12, 6, 4), IntArrayMat(12 + 16, 12, 12), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 3), IntArrayMat(8 + 12, 13, 13), IntArrayMat(-3, -2, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-16 + 1), IntArrayMat(-3)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(-6 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-5 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(2)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-4 + 1), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-6 + 1), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(0, 1)) + || test_crop(a, 
IntArrayMat(12, 6), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(-3, -2)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-3, -1)) + + || test_crop(a, IntArrayMat(5, 2), IntArrayMat(-5 + 1, -5 + 1), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(6, 4), IntArrayMat(-4 + 1, -4 + 1), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(4, 3), IntArrayMat(-6 + 1, -6 + 1), IntArrayMat(-2, -1)) + + || test_crop(a, IntArrayMat(11, 5, 4), IntArrayMat(-7 + 1, -5 + 1, -5 + 1), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(12, 6, 3), IntArrayMat(-12 + 1, -6 + 1, -6 + 1), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 2), IntArrayMat(-16 + 1, -4 + 1, -4 + 1), IntArrayMat(-3, -2, -1)); +} + +static int test_crop_10(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, IntArrayMat(11), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(1)) + || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(-2)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(3)) + || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(-1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-233, -233), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-233, -233), IntArrayMat(-4, -2)) + || test_crop(a, IntArrayMat(4, 4), IntArrayMat(-233, -233), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(5, 5), IntArrayMat(-233, -233), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(4, 4), IntArrayMat(-233, -233), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(12, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(12, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(4, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(6, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(11, 5, 5, 5), IntArrayMat(-233, -233, -233, -233), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(8, 4, 4, 4), IntArrayMat(-233, -233, -233, -233), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(12, 6, 6, 6), IntArrayMat(-233, -233, -233, -233), IntArrayMat(-4, -3, -2, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 12), IntArrayMat(-4)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(11), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(13), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(12), IntArrayMat(-3)) + + || test_crop(a, IntArrayMat(3), 
IntArrayMat(12), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(13), IntArrayMat(2)) + || test_crop(a, IntArrayMat(5), IntArrayMat(11), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(1), IntArrayMat(8), IntArrayMat(3)) + || test_crop(a, IntArrayMat(2), IntArrayMat(7), IntArrayMat(3)) + || test_crop(a, IntArrayMat(3), IntArrayMat(6), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 12), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 16, 13), IntArrayMat(-4, -3)) + + || test_crop(a, IntArrayMat(11, 4), IntArrayMat(11 + 12, 13), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(12, 3), IntArrayMat(12 + 16, 11), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(8, 2), IntArrayMat(8 + 7, 12), IntArrayMat(-4, -2)) + + || test_crop(a, IntArrayMat(11, 1), IntArrayMat(11 + 16, 5), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(12, 2), IntArrayMat(12 + 7, 6), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(8, 3), IntArrayMat(8 + 12, 7), IntArrayMat(-4, -1)) + + || test_crop(a, IntArrayMat(3, 3), IntArrayMat(13, 4), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(4, 2), IntArrayMat(12, 3), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(5, 1), IntArrayMat(11, 2), IntArrayMat(-3, -2)) + + || test_crop(a, IntArrayMat(5, 5), IntArrayMat(11, 8), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(4, 6), IntArrayMat(12, 9), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(3, 4), IntArrayMat(13, 7), IntArrayMat(-3, -1)) + + || test_crop(a, IntArrayMat(2, 3), IntArrayMat(12, 9), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(3, 2), IntArrayMat(11, 7), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(4, 1), IntArrayMat(10, 8), IntArrayMat(-2, -1)) + + || test_crop(a, IntArrayMat(11, 2, 2), IntArrayMat(11 + 6, 9, 9), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(12 + 1, 10, 10), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(8 + 3, 11, 11), IntArrayMat(-4, -3, -2)) + + || test_crop(a, IntArrayMat(11, 4, 4), IntArrayMat(11 + 12, 12, 12), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(12, 5, 5), IntArrayMat(12 + 8, 11, 11), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(8, 6, 6), IntArrayMat(8 + 4, 13, 13), IntArrayMat(-4, -3, -1)) + + || test_crop(a, IntArrayMat(11, 1, 4), IntArrayMat(11 + 5, 12, 12), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(12 + 3, 11, 11), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(8, 2, 5), IntArrayMat(8 + 2, 10, 10), IntArrayMat(-4, -2, -1)) + + || test_crop(a, IntArrayMat(1, 1, 1), IntArrayMat(7, 7, 7), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(2, 2, 2), IntArrayMat(8, 9, 10), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(3, 3, 3), IntArrayMat(11, 12, 13), IntArrayMat(-3, -2, -1)) + + || test_crop(a, IntArrayMat(11, 2, 3, 6), IntArrayMat(11 + 11, 10, 12, 11), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(12, 3, 4, 5), IntArrayMat(12 + 12, 9, 11, 13), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(8, 4, 5, 4), IntArrayMat(8 + 8, 8, 10, 12), IntArrayMat(-4, -3, -2, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-16 + 1), IntArrayMat(-4)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(-6 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), 
IntArrayMat(-5 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-3)) + + || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(2)) + || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(2)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-6 + 1), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(1), IntArrayMat(-5 + 1), IntArrayMat(3)) + || test_crop(a, IntArrayMat(2), IntArrayMat(-4 + 1), IntArrayMat(3)) + || test_crop(a, IntArrayMat(3), IntArrayMat(-3 + 1), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 3), IntArrayMat(-7 + 1, -3 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 4), IntArrayMat(-12 + 1, -4 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 5), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(-4, -3)) + + || test_crop(a, IntArrayMat(11, 1), IntArrayMat(-12 + 1, -5 + 1), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(12, 2), IntArrayMat(-16 + 1, -4 + 1), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(8, 3), IntArrayMat(-7 + 1, -6 + 1), IntArrayMat(-4, -2)) + + || test_crop(a, IntArrayMat(11, 3), IntArrayMat(-12 + 1, -2 + 1), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(12, 4), IntArrayMat(-16 + 1, -3 + 1), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(8, 5), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-4, -1)) + + || test_crop(a, IntArrayMat(2, 3), IntArrayMat(-4 + 1, -2 + 1), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(3, 4), IntArrayMat(-2 + 1, -3 + 1), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(4, 5), IntArrayMat(-3 + 1, -4 + 1), IntArrayMat(-3, -2)) + + || test_crop(a, IntArrayMat(3, 2), IntArrayMat(-2 + 1, -4 + 1), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(4, 3), IntArrayMat(-3 + 1, -2 + 1), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(5, 4), IntArrayMat(-4 + 1, -3 + 1), IntArrayMat(-3, -1)) + + || test_crop(a, IntArrayMat(2, 3), IntArrayMat(-4 + 1, -6 + 1), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(1, 2), IntArrayMat(-5 + 1, -5 + 1), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(3, 1), IntArrayMat(-6 + 1, -4 + 1), IntArrayMat(-2, -1)) + + || test_crop(a, IntArrayMat(11, 3, 3), IntArrayMat(-7 + 1, -3 + 1, -4 + 1), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(12, 4, 4), IntArrayMat(-12 + 1, -4 + 1, -3 + 1), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 5, 5), IntArrayMat(-16 + 1, -5 + 1, -5 + 1), IntArrayMat(-4, -3, -2)) + + || test_crop(a, IntArrayMat(11, 2, 2), IntArrayMat(-7 + 1, -5 + 1, -4 + 1), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(12, 1, 1), IntArrayMat(-12 + 1, -6 + 1, -5 + 1), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(8, 3, 3), IntArrayMat(-16 + 1, -4 + 1, -6 + 1), IntArrayMat(-4, -3, -1)) + + || test_crop(a, IntArrayMat(11, 2, 5), IntArrayMat(-7 + 1, -2 + 1, -5 + 1), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(-12 + 1, -3 + 1, -4 + 1), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-16 + 1, -4 + 1, -3 + 1), IntArrayMat(-4, -2, -1)) + + || test_crop(a, IntArrayMat(1, 3, 3), IntArrayMat(-3 + 1, -6 + 1, -4 + 1), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(2, 2, 2), IntArrayMat(-4 + 1, -4 + 1, -5 + 1), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(3, 1, 1), IntArrayMat(-5 + 1, -5 + 1, -6 + 1), IntArrayMat(-3, -2, -1)) + + || test_crop(a, IntArrayMat(11, 3, 4, 4), IntArrayMat(-7 + 1, -3 + 1, -2 + 1, -4 + 1), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(12, 4, 5, 3), IntArrayMat(-12 + 1, -4 + 1, -3 + 1, -5 + 1), IntArrayMat(0, 1, 2, 
3)) + || test_crop(a, IntArrayMat(8, 5, 6, 2), IntArrayMat(-16 + 1, -5 + 1, -4 + 1, -3 + 1), IntArrayMat(-4, -3, -2, -1)); +} + +int main() +{ + SRAND(776757); + + return 0 + || test_crop_1(RandomMat(112)) + || test_crop_1(RandomMat(126)) + || test_crop_1(RandomMat(127)) + || test_crop_4(RandomMat(20, 48)) + || test_crop_4(RandomMat(15, 36)) + || test_crop_4(RandomMat(16, 33)) + || test_crop_7(RandomMat(20, 20, 48)) + || test_crop_7(RandomMat(15, 15, 36)) + || test_crop_7(RandomMat(16, 16, 33)) + || test_crop_10(RandomMat(20, 20, 20, 48)) + || test_crop_10(RandomMat(15, 15, 15, 36)) + || test_crop_10(RandomMat(16, 16, 16, 33)); +} diff --git a/tests/test_crop_2.cpp b/tests/test_crop_2.cpp new file mode 100644 index 000000000000..287634b973eb --- /dev/null +++ b/tests/test_crop_2.cpp @@ -0,0 +1,122 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/crop.h" +#include "testutil.h" + +static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset, int coffset, const ncnn::Mat& ref) +{ + ncnn::ParamDict pd; + pd.set(0, woffset); + pd.set(1, hoffset); + pd.set(13, doffset); + pd.set(2, coffset); + pd.set(3, 0); // outw + pd.set(4, 0); // outh + pd.set(14, 0); // outd + pd.set(5, 0); // outc + pd.set(6, 0); // woffset2 + pd.set(7, 0); // hoffset2 + pd.set(15, 0); // doffset2 + pd.set(8, 0); // coffset2 + + std::vector weights(0); + + std::vector ab(2); + ab[0] = a; + ab[1] = ref; + + int ret = test_layer("Crop", pd, weights, ab); + if (ret != 0) + { + fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d) woffset=%d hoffset=%d doffset=%d coffset=%d ref.dims=%d ref=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c, woffset, hoffset, doffset, coffset, ref.dims, ref.w, ref.h, ref.d, ref.c); + } + + return ret; +} + +static int test_crop_2(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, 0, 0, 0, 0, a) + || test_crop(a, 0, 0, 0, 0, ncnn::Mat(27)) + + || test_crop(a, 11, 0, 0, 0, ncnn::Mat(7)) + || test_crop(a, 12, 0, 0, 0, ncnn::Mat(12)) + || test_crop(a, 16, 0, 0, 0, ncnn::Mat(16)); +} + +static int test_crop_5(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, 0, 0, 0, 0, a) + + || test_crop(a, 0, 12, 0, 0, ncnn::Mat(8, 7)) + || test_crop(a, 5, 0, 0, 0, ncnn::Mat(7, 27)) + + || test_crop(a, 5, 11, 0, 0, ncnn::Mat(5, 12)) + || test_crop(a, 6, 12, 0, 0, ncnn::Mat(4, 16)) + || test_crop(a, 4, 8, 0, 0, ncnn::Mat(6, 7)); +} + +static int test_crop_8(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, 0, 0, 0, 0, a) + + || test_crop(a, 0, 5, 0, 0, ncnn::Mat(6, 6)) + || test_crop(a, 6, 0, 0, 0, ncnn::Mat(8, 8)) + || test_crop(a, 5, 2, 0, 0, ncnn::Mat(6, 3)) + || test_crop(a, 6, 3, 0, 0, ncnn::Mat(8, 4)) + || test_crop(a, 4, 4, 0, 0, ncnn::Mat(7, 5)) + + || test_crop(a, 5, 3, 0, 11, ncnn::Mat(7, 3, 7)) + || test_crop(a, 6, 4, 0, 12, ncnn::Mat(6, 4, 12)) + || test_crop(a, 4, 2, 0, 8, ncnn::Mat(5, 5, 16)); 
+} + +static int test_crop_11(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, 0, 0, 0, 0, a) + + || test_crop(a, 0, 5, 0, 0, ncnn::Mat(6, 6, 6)) + || test_crop(a, 6, 0, 0, 0, ncnn::Mat(8, 8, 8)) + || test_crop(a, 5, 5, 5, 0, ncnn::Mat(6, 6, 6)) + || test_crop(a, 6, 6, 6, 0, ncnn::Mat(8, 8, 8)) + || test_crop(a, 4, 4, 4, 0, ncnn::Mat(5, 5, 5)) + + || test_crop(a, 3, 3, 3, 11, ncnn::Mat(3, 3, 3, 7)) + || test_crop(a, 4, 4, 4, 12, ncnn::Mat(6, 6, 6, 12)) + || test_crop(a, 5, 5, 5, 8, ncnn::Mat(8, 8, 8, 16)); +} + +int main() +{ + SRAND(776757); + + return 0 + || test_crop_2(RandomMat(112)) + || test_crop_2(RandomMat(126)) + || test_crop_2(RandomMat(127)) + || test_crop_5(RandomMat(20, 48)) + || test_crop_5(RandomMat(15, 36)) + || test_crop_5(RandomMat(16, 33)) + || test_crop_8(RandomMat(20, 20, 48)) + || test_crop_8(RandomMat(15, 15, 36)) + || test_crop_8(RandomMat(16, 16, 33)) + || test_crop_11(RandomMat(20, 20, 20, 48)) + || test_crop_11(RandomMat(15, 15, 15, 36)) + || test_crop_11(RandomMat(16, 16, 16, 33)); +} diff --git a/tests/test_deformableconv2d.cpp b/tests/test_deformableconv2d.cpp index 01511e544960..6cc2d6141382 100644 --- a/tests/test_deformableconv2d.cpp +++ b/tests/test_deformableconv2d.cpp @@ -59,7 +59,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int static int test_deformableconv2d_0() { - static const int kdsp[16][4] = { + static const int kdsp[10][4] = { {1, 1, 1, 0}, {1, 1, 2, 0}, {2, 1, 1, 1}, @@ -67,18 +67,12 @@ static int test_deformableconv2d_0() {3, 1, 1, 1}, {3, 1, 2, 1}, {3, 2, 1, 1}, - {4, 1, 1, 0}, {4, 1, 2, 1}, - {4, 2, 1, 1}, - {5, 1, 1, 2}, {5, 1, 2, 2}, {5, 2, 2, 2}, - {7, 1, 1, 3}, - {7, 1, 2, 3}, - {7, 2, 1, 3}, }; - for (int i = 0; i < 16; i++) + for (int i = 0; i < 4; i++) { const int k = kdsp[i][0]; const int d = kdsp[i][1]; @@ -93,7 +87,23 @@ static int test_deformableconv2d_0() || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) - || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0); + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0) + || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1); if (ret != 0) return -1; diff --git a/tests/test_deformableconv2d_1.cpp b/tests/test_deformableconv2d_1.cpp new file mode 100644 index 000000000000..2f2febf469df --- /dev/null +++ b/tests/test_deformableconv2d_1.cpp @@ -0,0 +1,120 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/deformableconv2d.h" +#include "testutil.h" + +static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + const int kernel_extent_w = dilation * (kernel - 1) + 1; + const int kernel_extent_h = dilation * (kernel - 1) + 1; + const int out_w = (w + pad + pad - kernel_extent_w) / stride + 1; + const int out_h = (h + pad + pad - kernel_extent_h) / stride + 1; + std::vector a(3); + a[0] = RandomMat(w, h, c); + a[1] = RandomMat(out_w, out_h, kernel * kernel * 2); + a[2] = RandomMat(out_w, out_h, kernel * kernel); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_deformableconv2d_0() +{ + static const int kdsp[10][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, 0}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 2, 1}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + }; + + for (int i = 4; i < 6; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_deformableconv2d(9, 7, 1, 1, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 8, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0) + || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, 
d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1); + + if (ret != 0) + return -1; + } + + return 0; +} + +int main() +{ + SRAND(7767517); + + return test_deformableconv2d_0(); +} diff --git a/tests/test_deformableconv2d_2.cpp b/tests/test_deformableconv2d_2.cpp new file mode 100644 index 000000000000..130761d87d59 --- /dev/null +++ b/tests/test_deformableconv2d_2.cpp @@ -0,0 +1,120 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/deformableconv2d.h" +#include "testutil.h" + +static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + const int kernel_extent_w = dilation * (kernel - 1) + 1; + const int kernel_extent_h = dilation * (kernel - 1) + 1; + const int out_w = (w + pad + pad - kernel_extent_w) / stride + 1; + const int out_h = (h + pad + pad - kernel_extent_h) / stride + 1; + std::vector a(3); + a[0] = RandomMat(w, h, c); + a[1] = RandomMat(out_w, out_h, kernel * kernel * 2); + a[2] = RandomMat(out_w, out_h, kernel * kernel); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 
2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_deformableconv2d_0() +{ + static const int kdsp[10][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, 0}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 2, 1}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + }; + + for (int i = 6; i < 8; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_deformableconv2d(9, 7, 1, 1, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 8, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0) + || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1); + + if (ret != 0) + return -1; + } + + return 0; +} + +int main() +{ + SRAND(7767517); + + return test_deformableconv2d_0(); +} diff --git a/tests/test_deformableconv2d_3.cpp b/tests/test_deformableconv2d_3.cpp new file mode 100644 index 000000000000..1a78a004db0c --- /dev/null +++ b/tests/test_deformableconv2d_3.cpp @@ -0,0 +1,120 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/deformableconv2d.h" +#include "testutil.h" + +static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + const int kernel_extent_w = dilation * (kernel - 1) + 1; + const int kernel_extent_h = dilation * (kernel - 1) + 1; + const int out_w = (w + pad + pad - kernel_extent_w) / stride + 1; + const int out_h = (h + pad + pad - kernel_extent_h) / stride + 1; + std::vector a(3); + a[0] = RandomMat(w, h, c); + a[1] = RandomMat(out_w, out_h, kernel * kernel * 2); + a[2] = RandomMat(out_w, out_h, kernel * kernel); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_deformableconv2d_0() +{ + static const int kdsp[10][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, 0}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 2, 1}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + }; + + for (int i = 8; i < 10; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_deformableconv2d(9, 7, 1, 1, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 8, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0) + || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1); + + if (ret != 0) + return -1; + } + + 
return 0; +} + +int main() +{ + SRAND(7767517); + + return test_deformableconv2d_0(); +} diff --git a/tests/test_deformableconv2d_4.cpp b/tests/test_deformableconv2d_4.cpp new file mode 100644 index 000000000000..eca9f289dec0 --- /dev/null +++ b/tests/test_deformableconv2d_4.cpp @@ -0,0 +1,76 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/deformableconv2d.h" +#include "testutil.h" + +static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + const int kernel_extent_w = dilation * (kernel - 1) + 1; + const int kernel_extent_h = dilation * (kernel - 1) + 1; + const int out_w = (w + pad + pad - kernel_extent_w) / stride + 1; + const int out_h = (h + pad + pad - kernel_extent_h) / stride + 1; + std::vector a(3); + a[0] = RandomMat(w, h, c); + a[1] = RandomMat(out_w, out_h, kernel * kernel * 2); + a[2] = RandomMat(out_w, out_h, kernel * kernel); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 
2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_deformableconv2d_0() +{ + return 0 + || test_deformableconv2d(7, 5, 24, 32, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 32, 24, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 28, 32, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 32, 28, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 26, 32, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 32, 26, 4, 2, 2, 2, 1); +} + +int main() +{ + SRAND(7767517); + + return test_deformableconv2d_0(); +} diff --git a/tests/test_expanddims.cpp b/tests/test_expanddims.cpp index b18dfdcc065a..d05d84a9d3b8 100644 --- a/tests/test_expanddims.cpp +++ b/tests/test_expanddims.cpp @@ -15,11 +15,12 @@ #include "layer/expanddims.h" #include "testutil.h" -static int test_expanddims(const ncnn::Mat& a, int expand_w, int expand_h, int expand_c) +static int test_expanddims(const ncnn::Mat& a, int expand_w, int expand_h, int expand_d, int expand_c) { ncnn::ParamDict pd; pd.set(0, expand_w); pd.set(1, expand_h); + pd.set(11, expand_d); pd.set(2, expand_c); std::vector weights(0); @@ -27,7 +28,7 @@ static int test_expanddims(const ncnn::Mat& a, int expand_w, int expand_h, int e int ret = test_layer("ExpandDims", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_expanddims failed a.dims=%d a=(%d %d %d) expand_w=%d expand_h=%d expand_c=%d\n", a.dims, a.w, a.h, a.c, expand_w, expand_h, expand_c); + fprintf(stderr, "test_expanddims failed a.dims=%d a=(%d %d %d %d) expand_w=%d expand_h=%d expand_d=%d expand_c=%d\n", a.dims, a.w, a.h, a.d, a.c, expand_w, expand_h, expand_d, expand_c); } return ret; @@ -60,6 +61,17 @@ static ncnn::Mat IntArrayMat(int a0, int a1, int a2) return m; } +static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) +{ + ncnn::Mat m(4); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + p[3] = a3; + return m; +} + static void print_int_array(const ncnn::Mat& a) { const int* pa = a; @@ -82,7 +94,7 @@ static int test_expanddims_axes(const ncnn::Mat& a, const ncnn::Mat& axes) int ret = test_layer("ExpandDims", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_expanddims_axes failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); + fprintf(stderr, "test_expanddims_axes failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); fprintf(stderr, " axes="); print_int_array(axes); fprintf(stderr, "\n"); @@ -91,48 +103,73 @@ static int test_expanddims_axes(const ncnn::Mat& a, const ncnn::Mat& axes) return ret; } -static int test_expand_0() +static int test_expanddims_all_params(const ncnn::Mat& a) { - ncnn::Mat as[7]; - as[0] = RandomMat(1, 1, 1); - as[1] = RandomMat(14, 16); - as[2] = RandomMat(1, 14); - as[3] = RandomMat(11, 1); - as[4] = RandomMat(1, 1); - as[5] = RandomMat(120); - as[6] = RandomMat(1); - - for (int i = 0; i < 7; i++) - { - const ncnn::Mat& a = as[i]; - int ret = 0 - || test_expanddims(a, 0, 0, 0) - || test_expanddims(a, 0, 0, 1) - || test_expanddims(a, 0, 1, 0) - || test_expanddims(a, 0, 1, 1) - || test_expanddims(a, 1, 0, 0) - || test_expanddims(a, 1, 0, 1) - || test_expanddims(a, 1, 
1, 0) - || test_expanddims(a, 1, 1, 1) - - || test_expanddims_axes(a, IntArrayMat(0)) - || test_expanddims_axes(a, IntArrayMat(1)) - || test_expanddims_axes(a, IntArrayMat(2)) - || test_expanddims_axes(a, IntArrayMat(0, 1)) - || test_expanddims_axes(a, IntArrayMat(0, 2)) - || test_expanddims_axes(a, IntArrayMat(1, 2)) - || test_expanddims_axes(a, IntArrayMat(0, 1, 2)); - - if (ret != 0) - return ret; - } + return 0 + || test_expanddims(a, 0, 0, 0, 0) + || test_expanddims(a, 0, 0, 0, 1) + || test_expanddims(a, 0, 0, 1, 0) + || test_expanddims(a, 0, 0, 1, 1) + || test_expanddims(a, 0, 1, 0, 0) + || test_expanddims(a, 0, 1, 0, 1) + || test_expanddims(a, 0, 1, 1, 0) + || test_expanddims(a, 0, 1, 1, 1) + || test_expanddims(a, 1, 0, 0, 0) + || test_expanddims(a, 1, 0, 0, 1) + || test_expanddims(a, 1, 0, 1, 0) + || test_expanddims(a, 1, 0, 1, 1) + || test_expanddims(a, 1, 1, 0, 0) + || test_expanddims(a, 1, 1, 0, 1) + || test_expanddims(a, 1, 1, 1, 0) + || test_expanddims(a, 1, 1, 1, 1) + + || test_expanddims_axes(a, IntArrayMat(0)) + || test_expanddims_axes(a, IntArrayMat(1)) + || test_expanddims_axes(a, IntArrayMat(2)) + || test_expanddims_axes(a, IntArrayMat(3)) + || test_expanddims_axes(a, IntArrayMat(0, 1)) + || test_expanddims_axes(a, IntArrayMat(0, 2)) + || test_expanddims_axes(a, IntArrayMat(0, 3)) + || test_expanddims_axes(a, IntArrayMat(1, 2)) + || test_expanddims_axes(a, IntArrayMat(1, 3)) + || test_expanddims_axes(a, IntArrayMat(2, 3)) + || test_expanddims_axes(a, IntArrayMat(0, 1, 2)) + || test_expanddims_axes(a, IntArrayMat(0, 1, 3)) + || test_expanddims_axes(a, IntArrayMat(0, 2, 3)) + || test_expanddims_axes(a, IntArrayMat(1, 2, 3)) + || test_expanddims_axes(a, IntArrayMat(0, 1, 2, 3)); +} + +static int test_expanddims_0() +{ + return 0 + || test_expanddims_all_params(RandomMat(3, 12, 16)) + || test_expanddims_all_params(RandomMat(3, 1, 16)) + || test_expanddims_all_params(RandomMat(1, 33, 15)) + || test_expanddims_all_params(RandomMat(1, 14, 1)) + || test_expanddims_all_params(RandomMat(12, 13, 1)) + || test_expanddims_all_params(RandomMat(1, 1, 1)); +} - return 0; +static int test_expanddims_1() +{ + return 0 + || test_expanddims_all_params(RandomMat(14, 16)) + || test_expanddims_all_params(RandomMat(1, 14)) + || test_expanddims_all_params(RandomMat(11, 1)) + || test_expanddims_all_params(RandomMat(1, 1)); +} + +static int test_expanddims_2() +{ + return 0 + || test_expanddims_all_params(RandomMat(120)) + || test_expanddims_all_params(RandomMat(1)); } int main() { SRAND(7767517); - return test_expand_0(); + return test_expanddims_0() || test_expanddims_1() || test_expanddims_2(); } diff --git a/tests/test_fold.cpp b/tests/test_fold.cpp new file mode 100644 index 000000000000..11a38428fdd7 --- /dev/null +++ b/tests/test_fold.cpp @@ -0,0 +1,58 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/fold.h" +#include "testutil.h" + +static int test_fold(int w, int h, int outw, int outh, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_w, int pad_h) +{ + ncnn::Mat a = RandomMat(w, h); + + ncnn::ParamDict pd; + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_w); + pd.set(14, pad_h); + pd.set(20, outw); + pd.set(21, outh); + + std::vector weights(0); + + int ret = test_layer("Fold", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_fold failed w=%d h=%d outw=%d outh=%d kernel=%d,%d dilation=%d,%d stride=%d,%d pad=%d,%d\n", w, h, outw, outh, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_w, pad_h); + } + + return ret; +} + +static int test_fold_0() +{ + return 0 + || test_fold(400, 108, 22, 22, 3, 3, 1, 1, 1, 1, 0, 0) + || test_fold(190, 96, 18, 17, 4, 2, 1, 1, 1, 2, 2, 2) + || test_fold(120, 36, 11, 5, 3, 2, 2, 1, 1, 1, 4, 2); +} + +int main() +{ + SRAND(7767517); + + return test_fold_0(); +} diff --git a/tests/test_gelu.cpp b/tests/test_gelu.cpp index 974079edea8f..f4ac70cf8e25 100644 --- a/tests/test_gelu.cpp +++ b/tests/test_gelu.cpp @@ -34,6 +34,8 @@ static int test_gelu(const ncnn::Mat& a, bool fast_gelu) static int test_gelu_0() { return 0 + || test_gelu(RandomMat(9, 7, 32), false) + || test_gelu(RandomMat(9, 7, 32), true) || test_gelu(RandomMat(5, 7, 24), false) || test_gelu(RandomMat(5, 7, 24), true) || test_gelu(RandomMat(7, 9, 12), false) @@ -45,6 +47,8 @@ static int test_gelu_0() static int test_gelu_1() { return 0 + || test_gelu(RandomMat(13, 32), false) + || test_gelu(RandomMat(13, 32), true) || test_gelu(RandomMat(15, 24), false) || test_gelu(RandomMat(15, 24), true) || test_gelu(RandomMat(17, 12), false) @@ -61,7 +65,9 @@ static int test_gelu_2() || test_gelu(RandomMat(124), false) || test_gelu(RandomMat(124), true) || test_gelu(RandomMat(127), false) - || test_gelu(RandomMat(127), true); + || test_gelu(RandomMat(127), true) + || test_gelu(RandomMat(120), false) + || test_gelu(RandomMat(120), true); } int main() diff --git a/tests/test_glu.cpp b/tests/test_glu.cpp new file mode 100644 index 000000000000..58555aa53570 --- /dev/null +++ b/tests/test_glu.cpp @@ -0,0 +1,69 @@ +// Copyright (c) 2022 Xiaomi Corp. (author: Fangjun Kuang) +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/glu.h" +#include "testutil.h" + +static int test_glu(const ncnn::Mat& a, int axis) +{ + ncnn::ParamDict pd; + pd.set(0, axis); + + std::vector weights(0); + + int ret = test_layer("GLU", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_glu failed a.dims=%d a=(%d %d %d) axis=%d\n", a.dims, a.w, a.h, a.c, axis); + } + + return ret; +} + +static int test_glu_0() +{ + return 0 + || test_glu(RandomMat(6, 7, 24), 0) + || test_glu(RandomMat(6, 8, 24), 1) + || test_glu(RandomMat(6, 8, 24), 2) + || test_glu(RandomMat(36, 7, 22), 0) + || test_glu(RandomMat(5, 256, 23), -2) + || test_glu(RandomMat(129, 9, 60), 2) + || test_glu(RandomMat(129, 9, 30), -1); +} + +static int test_glu_1() +{ + return 0 + || test_glu(RandomMat(10, 24), 0) + || test_glu(RandomMat(7, 24), 1) + || test_glu(RandomMat(128, 22), 0) + || test_glu(RandomMat(128, 256), 1); +} + +static int test_glu_2() +{ + return 0 + || test_glu(RandomMat(10), 0) + || test_glu(RandomMat(20), 0) + || test_glu(RandomMat(128), 0); +} + +int main() +{ + SRAND(7767517); + + return 0 + || test_glu_0() + || test_glu_1() + || test_glu_2(); +} diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp new file mode 100644 index 000000000000..70c96b304805 --- /dev/null +++ b/tests/test_gridsample.cpp @@ -0,0 +1,131 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/gridsample.h" +#include "testutil.h" + +static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample_type, int padding_mode, int align_corner) +{ + ncnn::ParamDict pd; + pd.set(0, sample_type); + pd.set(1, padding_mode); + pd.set(2, align_corner); + + std::vector weights(0); + + std::vector as(2); + as[0] = a; + as[1] = grid; + + int ret = test_layer("GridSample", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_gridsample failed a.dims=%d a=(%d %d %d %d) grid.dims=%d grid=(%d %d %d %d) sample_type=%d padding_mode=%d align_corner=%d", + a.dims, a.w, a.h, a.d, a.c, grid.dims, grid.w, grid.h, grid.d, grid.c, + sample_type, padding_mode, align_corner); + } + + return ret; +} + +static int test_gridsample_0() +{ + return 0 + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 3, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 3, 1); +} + +static int test_gridsample_1() +{ + return 0 + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 3, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 3, 1); +} + +static int test_gridsample_2() +{ + return 0 + || 
test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 3, 1); +} + +static int test_gridsample_3() +{ + return 0 + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 3, 1); +} + +int main() +{ + SRAND(7767517); + + return 0 + || test_gridsample_0() + || test_gridsample_1() + || test_gridsample_2() + || test_gridsample_3(); +} diff --git a/tests/test_groupnorm.cpp b/tests/test_groupnorm.cpp index e1e831c066c6..5f3c5c569161 100644 --- a/tests/test_groupnorm.cpp +++ b/tests/test_groupnorm.cpp @@ -38,6 +38,17 @@ static int test_groupnorm(const ncnn::Mat& a, int group, float eps) } static int test_groupnorm_0() +{ + return 0 + || test_groupnorm(RandomMat(3, 6, 4, 2), 1, 0.01f) + || test_groupnorm(RandomMat(2, 3, 3, 8), 2, 0.002f) + || test_groupnorm(RandomMat(3, 4, 5, 6), 3, 0.01f) + || test_groupnorm(RandomMat(4, 5, 6, 12), 4, 0.02f) + || test_groupnorm(RandomMat(5, 6, 7, 24), 2, 0.001f) + || test_groupnorm(RandomMat(2, 8, 9, 24), 3, 0.0001f); +} + +static int test_groupnorm_1() { return 0 || test_groupnorm(RandomMat(6, 4, 2), 1, 0.01f) @@ -48,10 +59,35 @@ static int test_groupnorm_0() || test_groupnorm(RandomMat(8, 9, 24), 3, 0.0001f); } +static int test_groupnorm_2() +{ + return 0 + || test_groupnorm(RandomMat(24, 2), 1, 0.01f) + || test_groupnorm(RandomMat(23, 8), 2, 0.002f) + || test_groupnorm(RandomMat(25, 6), 3, 0.01f) + || test_groupnorm(RandomMat(26, 12), 4, 0.02f) + || test_groupnorm(RandomMat(27, 24), 2, 0.001f) + || test_groupnorm(RandomMat(29, 24), 3, 0.0001f); +} + +static int test_groupnorm_3() +{ + return 0 + || test_groupnorm(RandomMat(12), 1, 0.01f) + || test_groupnorm(RandomMat(18), 2, 0.002f) + || test_groupnorm(RandomMat(36), 3, 0.01f) + || 
test_groupnorm(RandomMat(212), 4, 0.02f) + || test_groupnorm(RandomMat(124), 2, 0.001f) + || test_groupnorm(RandomMat(324), 3, 0.0001f); +} + int main() { SRAND(7767517); return 0 - || test_groupnorm_0(); + || test_groupnorm_0() + || test_groupnorm_1() + || test_groupnorm_2() + || test_groupnorm_3(); } diff --git a/tests/test_lstm.cpp b/tests/test_lstm.cpp index f002a1aeccf0..fb76ad0fbd7f 100644 --- a/tests/test_lstm.cpp +++ b/tests/test_lstm.cpp @@ -15,50 +15,64 @@ #include "layer/lstm.h" #include "testutil.h" -static int test_lstm(const ncnn::Mat& a, int outch, int direction) +static int test_lstm(const ncnn::Mat& a, int outch, int direction, int hidden_size = 0) { int input_size = a.w; int num_directions = direction == 2 ? 2 : 1; + if (hidden_size == 0) + hidden_size = outch; ncnn::ParamDict pd; pd.set(0, outch); - pd.set(1, outch * input_size * 4 * num_directions); + pd.set(1, hidden_size * input_size * 4 * num_directions); pd.set(2, direction); + pd.set(3, hidden_size); - std::vector weights(3); - weights[0] = RandomMat(outch * input_size * 4 * num_directions); - weights[1] = RandomMat(outch * 4 * num_directions); - weights[2] = RandomMat(outch * outch * 4 * num_directions); + std::vector weights(hidden_size == 0 ? 3 : 4); + weights[0] = RandomMat(hidden_size * input_size * 4 * num_directions); + weights[1] = RandomMat(hidden_size * 4 * num_directions); + weights[2] = RandomMat(outch * hidden_size * 4 * num_directions); + if (hidden_size) + { + weights[3] = RandomMat(hidden_size * outch * num_directions); + } int ret = test_layer("LSTM", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_lstm failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); + fprintf(stderr, "test_lstm failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); } return ret; } -int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) +int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction, int hidden_size = 0) { int input_size = a.w; int num_directions = direction == 2 ? 2 : 1; + if (hidden_size == 0) + hidden_size = outch; ncnn::ParamDict pd; pd.set(0, outch); - pd.set(1, outch * input_size * 4 * num_directions); + pd.set(1, hidden_size * input_size * 4 * num_directions); pd.set(2, direction); + pd.set(3, hidden_size); - std::vector weights(3); - weights[0] = RandomMat(outch * input_size * 4 * num_directions); - weights[1] = RandomMat(outch * 4 * num_directions); - weights[2] = RandomMat(outch * outch * 4 * num_directions); + std::vector weights(hidden_size == 0 ? 
3 : 4); + weights[0] = RandomMat(hidden_size * input_size * 4 * num_directions); + weights[1] = RandomMat(hidden_size * 4 * num_directions); + weights[2] = RandomMat(outch * hidden_size * 4 * num_directions); + if (hidden_size) + { + weights[3] = RandomMat(hidden_size * outch * num_directions); + } // initial hidden state ncnn::Mat hidden = RandomMat(outch, num_directions); // initial cell state - ncnn::Mat cell = RandomMat(outch, num_directions); + ncnn::Mat cell = RandomMat(hidden_size, num_directions); std::vector as(3); as[0] = a; @@ -68,32 +82,39 @@ int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) int ret = test_layer("LSTM", pd, weights, as, 3); if (ret != 0) { - fprintf(stderr, "test_lstm_layer_with_hidden failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); + fprintf(stderr, "test_lstm_layer_with_hidden failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); } return ret; } -int test_lstm_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction) +int test_lstm_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction, int hidden_size = 0) { int input_size = a.w; int num_directions = direction == 2 ? 2 : 1; + if (hidden_size == 0) + hidden_size = outch; ncnn::ParamDict pd; pd.set(0, outch); - pd.set(1, outch * input_size * 4 * num_directions); + pd.set(1, hidden_size * input_size * 4 * num_directions); pd.set(2, direction); + pd.set(3, hidden_size); - std::vector weights(3); - weights[0] = RandomMat(outch * input_size * 4 * num_directions); - weights[1] = RandomMat(outch * 4 * num_directions); - weights[2] = RandomMat(outch * outch * 4 * num_directions); + std::vector weights(hidden_size == 0 ? 3 : 4); + weights[0] = RandomMat(hidden_size * input_size * 4 * num_directions); + weights[1] = RandomMat(hidden_size * 4 * num_directions); + weights[2] = RandomMat(outch * hidden_size * 4 * num_directions); + if (hidden_size) + { + weights[3] = RandomMat(hidden_size * outch * num_directions); + } // initial hidden state ncnn::Mat hidden = RandomMat(outch, num_directions); // initial cell state - ncnn::Mat cell = RandomMat(outch, num_directions); + ncnn::Mat cell = RandomMat(hidden_size, num_directions); std::vector as(3); as[0] = a; @@ -103,26 +124,33 @@ int test_lstm_layer_with_hidden_input(const ncnn::Mat& a, int outch, int directi int ret = test_layer("LSTM", pd, weights, as, 1); if (ret != 0) { - fprintf(stderr, "test_lstm_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); + fprintf(stderr, "test_lstm_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); } return ret; } -int test_lstm_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction) +int test_lstm_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction, int hidden_size = 0) { int input_size = a.w; int num_directions = direction == 2 ? 
2 : 1; + if (hidden_size == 0) + hidden_size = outch; ncnn::ParamDict pd; pd.set(0, outch); - pd.set(1, outch * input_size * 4 * num_directions); + pd.set(1, hidden_size * input_size * 4 * num_directions); pd.set(2, direction); + pd.set(3, hidden_size); - std::vector weights(3); - weights[0] = RandomMat(outch * input_size * 4 * num_directions); - weights[1] = RandomMat(outch * 4 * num_directions); - weights[2] = RandomMat(outch * outch * 4 * num_directions); + std::vector weights(hidden_size == 0 ? 3 : 4); + weights[0] = RandomMat(hidden_size * input_size * 4 * num_directions); + weights[1] = RandomMat(hidden_size * 4 * num_directions); + weights[2] = RandomMat(outch * hidden_size * 4 * num_directions); + if (hidden_size) + { + weights[3] = RandomMat(hidden_size * outch * num_directions); + } std::vector as(1); as[0] = a; @@ -130,7 +158,7 @@ int test_lstm_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direct int ret = test_layer("LSTM", pd, weights, as, 3); if (ret != 0) { - fprintf(stderr, "test_lstm_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); + fprintf(stderr, "test_lstm_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); } return ret; @@ -147,7 +175,7 @@ static int test_lstm_0() || test_lstm(RandomMat(5, 16), 16, 2) || test_lstm(RandomMat(3, 16), 8, 2) || test_lstm(RandomMat(8, 16), 16, 2) - || test_lstm(RandomMat(2, 5), 17, 2); + || test_lstm(RandomMat(2, 5), 17, 2, 15); } static int test_lstm_1() @@ -160,7 +188,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 2) || test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 2) || test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 2) - || test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 2) + || test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 2, 33) || test_lstm_layer_with_hidden(RandomMat(4, 4), 1, 1) || test_lstm_layer_with_hidden(RandomMat(8, 2), 2, 1) || test_lstm_layer_with_hidden(RandomMat(16, 8), 7, 1) @@ -168,7 +196,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 1) || test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 1) || test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 1) - || test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 1) + || test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 1, 33) || test_lstm_layer_with_hidden(RandomMat(4, 2), 1, 0) || test_lstm_layer_with_hidden(RandomMat(8, 2), 2, 0) || test_lstm_layer_with_hidden(RandomMat(16, 8), 7, 0) @@ -176,7 +204,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 0) || test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 0) || test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 0) - || test_lstm_layer_with_hidden(RandomMat(2, 5), 17, 0) + || test_lstm_layer_with_hidden(RandomMat(2, 5), 17, 0, 15) || test_lstm_layer_with_hidden_input(RandomMat(4, 4), 1, 2) || test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 2) @@ -185,7 +213,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 2) || test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 2) || test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 2) - || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 2) + || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 2, 33) || test_lstm_layer_with_hidden_input(RandomMat(4, 4), 1, 1) || test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 1) || 
test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 1) @@ -193,7 +221,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 1) || test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 1) || test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 1) - || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 1) + || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 1, 33) || test_lstm_layer_with_hidden_input(RandomMat(4, 2), 1, 0) || test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 0) || test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 0) @@ -201,7 +229,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 0) || test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 0) || test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 0) - || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 17, 0) + || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 17, 0, 15) || test_lstm_layer_with_hidden_output(RandomMat(4, 4), 1, 2) || test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 2) @@ -210,7 +238,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 2) || test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 2) || test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 2) - || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 2) + || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 2, 33) || test_lstm_layer_with_hidden_output(RandomMat(4, 4), 1, 1) || test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 1) || test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 1) @@ -218,7 +246,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 1) || test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 1) || test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 1) - || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 1) + || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 1, 33) || test_lstm_layer_with_hidden_output(RandomMat(4, 2), 1, 0) || test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 0) || test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 0) @@ -226,7 +254,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 0) || test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 0) || test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 0) - || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 17, 0); + || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 17, 0, 15); } static int test_lstm_2() @@ -240,7 +268,7 @@ static int test_lstm_2() || test_lstm(RandomMat(5, 16), 16, 0) || test_lstm(RandomMat(3, 16), 8, 0) || test_lstm(RandomMat(8, 16), 16, 0) - || test_lstm(RandomMat(2, 5), 17, 0); + || test_lstm(RandomMat(2, 5), 17, 0, 15); } static int test_lstm_3() { @@ -253,7 +281,7 @@ static int test_lstm_3() || test_lstm(RandomMat(5, 16), 16, 1) || test_lstm(RandomMat(3, 16), 8, 1) || test_lstm(RandomMat(8, 16), 16, 1) - || test_lstm(RandomMat(2, 5), 17, 1); + || test_lstm(RandomMat(2, 5), 17, 1, 15); } int main() diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index f4e0b1b44f58..e7440fd55bda 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -15,34 +15,70 @@ #include "layer/multiheadattention.h" #include "testutil.h" -static int test_multiheadattention(const ncnn::Mat& a, int num_heads) +static int test_multiheadattention(const ncnn::Mat& q, const ncnn::Mat& k, const ncnn::Mat& v, int 
num_heads, int kdim, int vdim) { - int embed_dim = a.w; + int embed_dim = q.w; ncnn::ParamDict pd; pd.set(0, embed_dim); pd.set(1, num_heads); pd.set(2, embed_dim * embed_dim); + pd.set(3, kdim); + pd.set(4, vdim); std::vector weights(8); weights[0] = RandomMat(embed_dim * embed_dim); weights[1] = RandomMat(embed_dim); - weights[2] = RandomMat(embed_dim * embed_dim); + weights[2] = RandomMat(embed_dim * kdim); weights[3] = RandomMat(embed_dim); - weights[4] = RandomMat(embed_dim * embed_dim); + weights[4] = RandomMat(embed_dim * vdim); weights[5] = RandomMat(embed_dim); weights[6] = RandomMat(embed_dim * embed_dim); weights[7] = RandomMat(embed_dim); std::vector as(3); - as[0] = a; - as[1] = a; - as[2] = a; + as[0] = q; + as[1] = k; + as[2] = v; int ret = test_layer("MultiHeadAttention", pd, weights, as); if (ret != 0) { - fprintf(stderr, "test_multiheadattention failed a=(%d %d)\n", a.w, a.h); + fprintf(stderr, "test_multiheadattention failed q=(%d %d) k=(%d %d) v=(%d %d)\n", q.w, q.h, k.w, k.h, v.w, v.h); + } + + return ret; +} + +static int test_multiheadattention_samekv(const ncnn::Mat& q, const ncnn::Mat& kv, int num_heads, int kvdim) +{ + int embed_dim = q.w; + + ncnn::ParamDict pd; + pd.set(0, embed_dim); + pd.set(1, num_heads); + pd.set(2, embed_dim * embed_dim); + pd.set(3, kvdim); + pd.set(4, kvdim); + + std::vector weights(8); + weights[0] = RandomMat(embed_dim * embed_dim); + weights[1] = RandomMat(embed_dim); + weights[2] = RandomMat(embed_dim * kvdim); + weights[3] = RandomMat(embed_dim); + weights[4] = RandomMat(embed_dim * kvdim); + weights[5] = RandomMat(embed_dim); + weights[6] = RandomMat(embed_dim * embed_dim); + weights[7] = RandomMat(embed_dim); + + std::vector as(2); + as[0] = q; + as[1] = kv; + + int ret = test_layer("MultiHeadAttention", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_multiheadattention failed q=(%d %d) kv=(%d %d)\n", q.w, q.h, kv.w, kv.h); } return ret; @@ -82,11 +118,26 @@ static int test_multiheadattention_sameqkv(const ncnn::Mat& a, int num_heads) static int test_multiheadattention_0() { return 0 - || test_multiheadattention(RandomMat(64, 128), 4) - || test_multiheadattention(RandomMat(64, 127), 16); + || test_multiheadattention(RandomMat(64, 128), RandomMat(64, 128), RandomMat(64, 128), 4, 64, 64) + || test_multiheadattention(RandomMat(64, 127), RandomMat(64, 127), RandomMat(64, 127), 16, 64, 64) + || test_multiheadattention(RandomMat(16, 128), RandomMat(44, 128), RandomMat(55, 128), 2, 44, 55) + || test_multiheadattention(RandomMat(16, 128), RandomMat(44, 127), RandomMat(55, 127), 4, 44, 55) + || test_multiheadattention(RandomMat(12, 17), RandomMat(28, 127), RandomMat(32, 127), 3, 28, 32) + || test_multiheadattention(RandomMat(12, 17), RandomMat(28, 32), RandomMat(11, 32), 3, 28, 11); } static int test_multiheadattention_1() +{ + return 0 + || test_multiheadattention_samekv(RandomMat(64, 128), RandomMat(64, 128), 4, 64) + || test_multiheadattention_samekv(RandomMat(64, 127), RandomMat(64, 127), 16, 64) + || test_multiheadattention_samekv(RandomMat(16, 128), RandomMat(44, 128), 2, 44) + || test_multiheadattention_samekv(RandomMat(16, 128), RandomMat(22, 127), 4, 22) + || test_multiheadattention_samekv(RandomMat(12, 17), RandomMat(28, 127), 3, 28) + || test_multiheadattention_samekv(RandomMat(12, 17), RandomMat(11, 32), 3, 11); +} + +static int test_multiheadattention_2() { return 0 || test_multiheadattention_sameqkv(RandomMat(64, 128), 8) @@ -99,5 +150,6 @@ int main() return 0 || test_multiheadattention_0() - || 
test_multiheadattention_1(); + || test_multiheadattention_1() + || test_multiheadattention_2(); } diff --git a/tests/test_prelu.cpp b/tests/test_prelu.cpp index 7305dc899d52..4184a288ada5 100644 --- a/tests/test_prelu.cpp +++ b/tests/test_prelu.cpp @@ -37,6 +37,8 @@ static int test_prelu_0() return 0 || test_prelu(RandomMat(5, 7, 24), 24) || test_prelu(RandomMat(5, 7, 24), 1) + || test_prelu(RandomMat(5, 7, 32), 32) + || test_prelu(RandomMat(5, 7, 32), 1) || test_prelu(RandomMat(7, 9, 12), 12) || test_prelu(RandomMat(7, 9, 12), 1) || test_prelu(RandomMat(3, 5, 13), 13) @@ -48,6 +50,8 @@ static int test_prelu_1() return 0 || test_prelu(RandomMat(15, 24), 24) || test_prelu(RandomMat(15, 24), 1) + || test_prelu(RandomMat(15, 32), 32) + || test_prelu(RandomMat(15, 32), 1) || test_prelu(RandomMat(17, 12), 12) || test_prelu(RandomMat(17, 12), 1) || test_prelu(RandomMat(19, 15), 15) @@ -61,6 +65,8 @@ static int test_prelu_2() || test_prelu(RandomMat(128), 1) || test_prelu(RandomMat(124), 124) || test_prelu(RandomMat(124), 1) + || test_prelu(RandomMat(120), 120) + || test_prelu(RandomMat(120), 1) || test_prelu(RandomMat(127), 127) || test_prelu(RandomMat(127), 1); } diff --git a/tests/test_squeeze.cpp b/tests/test_squeeze.cpp index 6834349e0fb5..403f95bdf9b1 100644 --- a/tests/test_squeeze.cpp +++ b/tests/test_squeeze.cpp @@ -15,11 +15,12 @@ #include "layer/squeeze.h" #include "testutil.h" -static int test_squeeze(const ncnn::Mat& a, int squeeze_w, int squeeze_h, int squeeze_c) +static int test_squeeze(const ncnn::Mat& a, int squeeze_w, int squeeze_h, int squeeze_d, int squeeze_c) { ncnn::ParamDict pd; pd.set(0, squeeze_w); pd.set(1, squeeze_h); + pd.set(11, squeeze_d); pd.set(2, squeeze_c); std::vector weights(0); @@ -27,7 +28,7 @@ static int test_squeeze(const ncnn::Mat& a, int squeeze_w, int squeeze_h, int sq int ret = test_layer("Squeeze", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_squeeze failed a.dims=%d a=(%d %d %d) squeeze_w=%d squeeze_h=%d squeeze_c=%d\n", a.dims, a.w, a.h, a.c, squeeze_w, squeeze_h, squeeze_c); + fprintf(stderr, "test_squeeze failed a.dims=%d a=(%d %d %d %d) squeeze_w=%d squeeze_h=%d squeeze_d=%d squeeze_c=%d\n", a.dims, a.w, a.h, a.d, a.c, squeeze_w, squeeze_h, squeeze_d, squeeze_c); } return ret; @@ -60,6 +61,17 @@ static ncnn::Mat IntArrayMat(int a0, int a1, int a2) return m; } +static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) +{ + ncnn::Mat m(4); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + p[3] = a3; + return m; +} + static void print_int_array(const ncnn::Mat& a) { const int* pa = a; @@ -82,7 +94,7 @@ static int test_squeeze_axes(const ncnn::Mat& a, const ncnn::Mat& axes) int ret = test_layer("Squeeze", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_squeeze_axes failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); + fprintf(stderr, "test_squeeze_axes failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); fprintf(stderr, " axes="); print_int_array(axes); fprintf(stderr, "\n"); @@ -91,53 +103,93 @@ static int test_squeeze_axes(const ncnn::Mat& a, const ncnn::Mat& axes) return ret; } +static int test_squeeze_all_params(const ncnn::Mat& a) +{ + return 0 + || test_squeeze(a, 0, 0, 0, 0) + || test_squeeze(a, 0, 0, 0, 1) + || test_squeeze(a, 0, 0, 1, 0) + || test_squeeze(a, 0, 0, 1, 1) + || test_squeeze(a, 0, 1, 0, 0) + || test_squeeze(a, 0, 1, 0, 1) + || test_squeeze(a, 0, 1, 1, 0) + || test_squeeze(a, 0, 1, 1, 1) + || test_squeeze(a, 1, 0, 0, 0) + || test_squeeze(a, 1, 0, 0, 1) + || test_squeeze(a, 1, 0, 1, 
0) + || test_squeeze(a, 1, 0, 1, 1) + || test_squeeze(a, 1, 1, 0, 0) + || test_squeeze(a, 1, 1, 0, 1) + || test_squeeze(a, 1, 1, 1, 0) + || test_squeeze(a, 1, 1, 1, 1) + + || test_squeeze_axes(a, IntArrayMat(0)) + || test_squeeze_axes(a, IntArrayMat(1)) + || test_squeeze_axes(a, IntArrayMat(2)) + || test_squeeze_axes(a, IntArrayMat(3)) + || test_squeeze_axes(a, IntArrayMat(0, 1)) + || test_squeeze_axes(a, IntArrayMat(0, 2)) + || test_squeeze_axes(a, IntArrayMat(0, 3)) + || test_squeeze_axes(a, IntArrayMat(1, 2)) + || test_squeeze_axes(a, IntArrayMat(1, 3)) + || test_squeeze_axes(a, IntArrayMat(2, 3)) + || test_squeeze_axes(a, IntArrayMat(0, 1, 2)) + || test_squeeze_axes(a, IntArrayMat(0, 1, 3)) + || test_squeeze_axes(a, IntArrayMat(0, 2, 3)) + || test_squeeze_axes(a, IntArrayMat(1, 2, 3)) + || test_squeeze_axes(a, IntArrayMat(0, 1, 2, 3)); +} + static int test_squeeze_0() { - ncnn::Mat as[12]; - as[0] = RandomMat(3, 12, 16); - as[1] = RandomMat(3, 1, 16); - as[2] = RandomMat(1, 33, 15); - as[3] = RandomMat(1, 14, 1); - as[4] = RandomMat(12, 13, 1); - as[5] = RandomMat(1, 1, 1); - as[6] = RandomMat(14, 16); - as[7] = RandomMat(1, 14); - as[8] = RandomMat(11, 1); - as[9] = RandomMat(1, 1); - as[10] = RandomMat(120); - as[11] = RandomMat(1); - - for (int i = 0; i < 12; i++) - { - const ncnn::Mat& a = as[i]; - int ret = 0 - || test_squeeze(a, 0, 0, 0) - || test_squeeze(a, 0, 0, 1) - || test_squeeze(a, 0, 1, 0) - || test_squeeze(a, 0, 1, 1) - || test_squeeze(a, 1, 0, 0) - || test_squeeze(a, 1, 0, 1) - || test_squeeze(a, 1, 1, 0) - || test_squeeze(a, 1, 1, 1) - - || test_squeeze_axes(a, IntArrayMat(0)) - || test_squeeze_axes(a, IntArrayMat(1)) - || test_squeeze_axes(a, IntArrayMat(2)) - || test_squeeze_axes(a, IntArrayMat(0, 1)) - || test_squeeze_axes(a, IntArrayMat(0, 2)) - || test_squeeze_axes(a, IntArrayMat(1, 2)) - || test_squeeze_axes(a, IntArrayMat(0, 1, 2)); - - if (ret != 0) - return ret; - } + return 0 + || test_squeeze_all_params(RandomMat(4, 5, 7, 16)) + || test_squeeze_all_params(RandomMat(4, 5, 1, 15)) + || test_squeeze_all_params(RandomMat(4, 1, 7, 12)) + || test_squeeze_all_params(RandomMat(1, 5, 7, 16)) + || test_squeeze_all_params(RandomMat(1, 5, 1, 15)) + || test_squeeze_all_params(RandomMat(1, 1, 7, 12)) + || test_squeeze_all_params(RandomMat(6, 1, 1, 16)) + || test_squeeze_all_params(RandomMat(1, 1, 1, 15)) + || test_squeeze_all_params(RandomMat(4, 5, 7, 1)) + || test_squeeze_all_params(RandomMat(4, 5, 1, 1)) + || test_squeeze_all_params(RandomMat(4, 1, 7, 1)) + || test_squeeze_all_params(RandomMat(1, 5, 7, 1)) + || test_squeeze_all_params(RandomMat(1, 5, 1, 1)) + || test_squeeze_all_params(RandomMat(1, 1, 7, 1)) + || test_squeeze_all_params(RandomMat(1, 1, 1, 1)); +} + +static int test_squeeze_1() +{ + return 0 + || test_squeeze_all_params(RandomMat(3, 12, 16)) + || test_squeeze_all_params(RandomMat(3, 1, 16)) + || test_squeeze_all_params(RandomMat(1, 33, 15)) + || test_squeeze_all_params(RandomMat(1, 14, 1)) + || test_squeeze_all_params(RandomMat(12, 13, 1)) + || test_squeeze_all_params(RandomMat(1, 1, 1)); +} - return 0; +static int test_squeeze_2() +{ + return 0 + || test_squeeze_all_params(RandomMat(14, 16)) + || test_squeeze_all_params(RandomMat(1, 14)) + || test_squeeze_all_params(RandomMat(11, 1)) + || test_squeeze_all_params(RandomMat(1, 1)); +} + +static int test_squeeze_3() +{ + return 0 + || test_squeeze_all_params(RandomMat(120)) + || test_squeeze_all_params(RandomMat(1)); } int main() { SRAND(7767517); - return test_squeeze_0(); + return test_squeeze_0() || 
test_squeeze_1() || test_squeeze_2() || test_squeeze_3(); } diff --git a/tests/test_squeezenet.cpp b/tests/test_squeezenet.cpp index 81789d26a728..07788c8edd10 100644 --- a/tests/test_squeezenet.cpp +++ b/tests/test_squeezenet.cpp @@ -177,6 +177,16 @@ static int test_squeezenet(const ncnn::Option& opt, int load_model_type, float e { // load from plain model file squeezenet.load_param(MODEL_DIR "/squeezenet_v1.1.param"); + + // test random feature disabled bits + { + std::vector& layers = squeezenet.mutable_layers(); + for (size_t i = 0; i < layers.size(); i++) + { + layers[i]->featmask = i * 11 % 128; + } + } + squeezenet.load_model(MODEL_DIR "/squeezenet_v1.1.bin"); } if (load_model_type == 1) diff --git a/tests/test_unfold.cpp b/tests/test_unfold.cpp new file mode 100644 index 000000000000..4eea1d020eab --- /dev/null +++ b/tests/test_unfold.cpp @@ -0,0 +1,65 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/unfold.h" +#include "testutil.h" + +static int test_unfold(int w, int h, int c, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_w, int pad_h, float pad_value) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_w); + pd.set(14, pad_h); + pd.set(18, pad_value); + + std::vector weights(0); + + int ret = test_layer("Unfold", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_unfold failed w=%d h=%d c=%d kernel=%d,%d dilation=%d,%d stride=%d,%d pad=%d,%d pad_value=%f\n", w, h, c, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_w, pad_h, pad_value); + } + + return ret; +} + +static int test_unfold_0() +{ + return 0 + || test_unfold(32, 32, 11, 3, 3, 1, 1, 1, 1, 0, 0, 0.f) + || test_unfold(32, 32, 12, 4, 2, 1, 1, 1, 2, 2, 2, -0.5f) + || test_unfold(32, 32, 16, 3, 2, 2, 1, 1, 1, 4, 2, 2.f); +} + +static int test_unfold_1() +{ + return 0 + || test_unfold(32, 32, 11, 3, 3, 1, 1, 1, 1, -233, -233, -0.5f) + || test_unfold(32, 32, 12, 4, 2, 1, 1, 1, 2, -234, -234, 0.f) + || test_unfold(32, 32, 16, 3, 2, 2, 1, 1, 1, -233, -233, 1.f); +} + +int main() +{ + SRAND(7767517); + + return test_unfold_0() || test_unfold_1(); +} diff --git a/toolchains/loongarch64-linux-gnu.toolchain.cmake b/toolchains/loongarch64-linux-gnu.toolchain.cmake index 4390155f2b42..7cdfd9dbef8d 100644 --- a/toolchains/loongarch64-linux-gnu.toolchain.cmake +++ b/toolchains/loongarch64-linux-gnu.toolchain.cmake @@ -1,8 +1,18 @@ set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_PROCESSOR loongarch64) -set(CMAKE_C_COMPILER "loongarch64-linux-gnu-gcc") -set(CMAKE_CXX_COMPILER "loongarch64-linux-gnu-g++") +if(DEFINED ENV{LOONGARCH64_ROOT_PATH}) + file(TO_CMAKE_PATH $ENV{LOONGARCH64_ROOT_PATH} 
LOONGARCH64_ROOT_PATH) +else() + message(FATAL_ERROR "LOONGARCH64_ROOT_PATH env must be defined") +endif() + +set(LOONGARCH64_ROOT_PATH ${LOONGARCH64_ROOT_PATH} CACHE STRING "root path to loongarch64 toolchain") + +set(CMAKE_C_COMPILER "${LOONGARCH64_ROOT_PATH}/bin/loongarch64-linux-gnu-gcc") +set(CMAKE_CXX_COMPILER "${LOONGARCH64_ROOT_PATH}/bin/loongarch64-linux-gnu-g++") + +set(CMAKE_FIND_ROOT_PATH "${LOONGARCH64_ROOT_PATH}/loongarch64-linux-gnu") set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) diff --git a/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake new file mode 100644 index 000000000000..953f21aaf959 --- /dev/null +++ b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake @@ -0,0 +1,29 @@ +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +if(DEFINED ENV{RISCV_ROOT_PATH}) + file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) +else() + message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") +endif() + +set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv gnu toolchain") + +set(CMAKE_C_COMPILER "clang") +set(CMAKE_CXX_COMPILER "clang++") +set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot") + +set(CMAKE_C_COMPILER_TARGET "riscv64-unknown-linux-gnu") +set(CMAKE_CXX_COMPILER_TARGET "riscv64-unknown-linux-gnu") + +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# add --ld-path=${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-ld or append $RISCV_ROOT_PATH/bin to PATH. +set(CMAKE_C_FLAGS "--gcc-toolchain=${RISCV_ROOT_PATH} -march=rv64gc") +set(CMAKE_CXX_FLAGS "--gcc-toolchain=${RISCV_ROOT_PATH} -march=rv64gc") + +# cache flags +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") diff --git a/toolchains/th1520-v240.toolchain.cmake b/toolchains/th1520-v240.toolchain.cmake new file mode 100644 index 000000000000..fb9787a82632 --- /dev/null +++ b/toolchains/th1520-v240.toolchain.cmake @@ -0,0 +1,31 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR riscv64) +set(C906 True) + +if(DEFINED ENV{RISCV_ROOT_PATH}) + file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) +else() + message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") +endif() + +set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain") + +set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc") +set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++") + +set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu") + +set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot") + +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) + +set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") +set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") + +# cache flags +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") + diff --git a/tools/modelwriter.h b/tools/modelwriter.h index e9ff979176a7..0a53c099da48 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -1569,7 +1569,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) 
fprintf_param_value(" 1=%d", expand_h) fprintf_param_value(" 2=%d", expand_c) { - if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); + if (!op->axes.empty()) fprintf_param_int_array(3, op->axes, pp); } } else if (layer->type == "GELU") diff --git a/tools/onnx/onnx2ncnn.cpp b/tools/onnx/onnx2ncnn.cpp index 069e6660c1d8..161cf94b2721 100644 --- a/tools/onnx/onnx2ncnn.cpp +++ b/tools/onnx/onnx2ncnn.cpp @@ -2930,6 +2930,30 @@ static void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph, std::map< } } +// truncate layer/blob names when they exceed 255, which is the upper length limit when parsing param in src/net.cpp +static std::string trunc_name(std::string name) +{ + static int trunc_idx = 0; + static std::map name_trunc_map; + + const int max_len = 255; + if (name.size() <= max_len) + { + return name; + } + if (name_trunc_map.count(name)) + { + return name_trunc_map[name]; + } + + std::string concat_name = name + "_t" + std::to_string(trunc_idx); + std::string trunc_name = concat_name.substr(concat_name.size() - max_len); + trunc_idx += 1; + name_trunc_map[name] = trunc_name; + + return trunc_name; +} + int main(int argc, char** argv) { if (!(argc == 2 || argc == 4)) @@ -3433,7 +3457,7 @@ int main(int argc, char** argv) if (weights.find(input_name) != weights.end()) continue; - fprintf(pp, "%-16s %-24s 0 1 %s\n", "Input", input_name.c_str(), input_name.c_str()); + fprintf(pp, "%-16s %-24s 0 1 %s\n", "Input", trunc_name(input_name).c_str(), trunc_name(input_name).c_str()); int refcount = node_reference[input_name]; if (refcount <= 1) @@ -3444,11 +3468,12 @@ int main(int argc, char** argv) char splitname[256]; sprintf(splitname, "splitncnn_input%d", j); fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - fprintf(pp, " %s", input_name.c_str()); + fprintf(pp, " %s", trunc_name(input_name).c_str()); for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + std::string split_name = input_name + "_splitncnn_" + std::to_string(k); + fprintf(pp, " %s", trunc_name(split_name).c_str()); } fprintf(pp, "\n"); } @@ -3464,7 +3489,7 @@ int main(int argc, char** argv) continue; } - fprintf(pp, "%-16s %-24s 0 1 %s", "MemoryData", input_name.c_str(), input_name.c_str()); + fprintf(pp, "%-16s %-24s 0 1 %s", "MemoryData", trunc_name(input_name).c_str(), trunc_name(input_name).c_str()); const onnx::TensorProto& M = weights[input_name]; @@ -3513,11 +3538,12 @@ int main(int argc, char** argv) sprintf(splitname, "splitncnn_%d", internal_split); fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - fprintf(pp, " %s", input_name.c_str()); + fprintf(pp, " %s", trunc_name(input_name).c_str()); for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + std::string split_name = input_name + "_splitncnn_" + std::to_string(k); + fprintf(pp, " %s", trunc_name(split_name).c_str()); } fprintf(pp, "\n"); @@ -3939,7 +3965,7 @@ int main(int argc, char** argv) fprintf(pp, "%-16s", op.c_str()); } - fprintf(pp, " %-24s %d %d", name.c_str(), input_size, output_size); + fprintf(pp, " %-24s %d %d", trunc_name(name).c_str(), input_size, output_size); for (int j = 0; j < (int)node.input_size(); j++) { @@ -3966,14 +3992,14 @@ int main(int argc, char** argv) input_name = input_name + splitsuffix; } - fprintf(pp, " %s", input_name.c_str()); + fprintf(pp, " %s", trunc_name(input_name).c_str()); } for (int j = 0; j < output_size; j++) { const std::string& output_name = node.output(j); - fprintf(pp, " %s", 
output_name.c_str()); + fprintf(pp, " %s", trunc_name(output_name).c_str()); } if (op == "Abs") @@ -6064,11 +6090,12 @@ int main(int argc, char** argv) sprintf(splitname, "splitncnn_%d", internal_split); fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - fprintf(pp, " %s", output_name.c_str()); + fprintf(pp, " %s", trunc_name(output_name).c_str()); for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", output_name.c_str(), k); + std::string split_name = output_name + "_splitncnn_" + std::to_string(k); + fprintf(pp, " %s", trunc_name(split_name).c_str()); } fprintf(pp, "\n"); diff --git a/tools/pnnx/README.md b/tools/pnnx/README.md index f9997e193627..882280d1f438 100644 --- a/tools/pnnx/README.md +++ b/tools/pnnx/README.md @@ -62,7 +62,7 @@ mod.save("resnet18.pt") pnnx resnet18.pt inputshape=[1,3,224,224] ``` -Normally, you will get six files +Normally, you will get seven files ```resnet18.pnnx.param``` PNNX graph definition @@ -70,6 +70,8 @@ Normally, you will get six files ```resnet18_pnnx.py``` PyTorch script for inference, the python code for model construction and weight initialization +```resnet18.pnnx.onnx``` PNNX model in onnx format + ```resnet18.ncnn.param``` ncnn graph definition ```resnet18.ncnn.bin``` ncnn model weight @@ -87,9 +89,11 @@ Usage: pnnx [model.pt] [(key=value)...] pnnxparam=model.pnnx.param pnnxbin=model.pnnx.bin pnnxpy=model_pnnx.py + pnnxonnx=model.pnnx.onnx ncnnparam=model.ncnn.param ncnnbin=model.ncnn.bin ncnnpy=model_ncnn.py + fp16=1 optlevel=2 device=cpu/gpu inputshape=[1,3,224,224],... @@ -108,12 +112,16 @@ Parameters: `pnnxpy` (default="*_pnnx.py"): PyTorch script for inference, including model construction and weight initialization code +`pnnxonnx` (default="*.pnnx.onnx"): PNNX model in onnx format + `ncnnparam` (default="*.ncnn.param"): ncnn graph definition `ncnnbin` (default="*.ncnn.bin"): ncnn model weight `ncnnpy` (default="*_ncnn.py"): pyncnn script for inference +`fp16` (default=1): save ncnn weight and onnx in fp16 data type + `optlevel` (default=2): graph optimization level | Option | Optimization level | @@ -484,10 +492,11 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |nn.Embedding | :heavy_check_mark: | :heavy_check_mark: | |nn.EmbeddingBag | | |nn.Flatten | :heavy_check_mark: | -|nn.Fold | | +|nn.Fold | :heavy_check_mark: | :heavy_check_mark: | |nn.FractionalMaxPool2d | | |nn.FractionalMaxPool3d | | |nn.GELU | :heavy_check_mark: | :heavy_check_mark: | +|nn.GLU | :heavy_check_mark: | :heavy_check_mark: | |nn.GroupNorm | :heavy_check_mark: | :heavy_check_mark: | |nn.GRU | :heavy_check_mark: | :heavy_check_mark: | |nn.GRUCell | | @@ -546,7 +555,7 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |nn.Sigmoid | :heavy_check_mark: | :heavy_check_mark: | |nn.SiLU | :heavy_check_mark: | :heavy_check_mark: | |nn.Softmax | :heavy_check_mark: | :heavy_check_mark: | -|nn.Softmax2d | | +|nn.Softmax2d | :heavy_check_mark: | :heavy_check_mark: | |nn.Softmin | :heavy_check_mark: | |nn.Softplus | :heavy_check_mark: | |nn.Softshrink | :heavy_check_mark: | @@ -561,7 +570,7 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |nn.TransformerEncoder | | |nn.TransformerEncoderLayer | | |nn.Unflatten | | -|nn.Unfold | | +|nn.Unfold | :heavy_check_mark: | :heavy_check_mark: | |nn.Upsample | :heavy_check_mark: | :heavy_check_mark: | |nn.UpsamplingBilinear2d | :heavy_check_mark: | :heavy_check_mark: | |nn.UpsamplingNearest2d | :heavy_check_mark: | :heavy_check_mark: | @@ -599,12 +608,12 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |F.embedding | :heavy_check_mark: | :heavy_check_mark: 
| |F.embedding_bag | | |F.feature_alpha_dropout | :heavy_check_mark: | :heavy_check_mark: | -|F.fold | | +|F.fold | :heavy_check_mark: | :heavy_check_mark: | |F.fractional_max_pool2d | | |F.fractional_max_pool3d | | |F.gelu | :heavy_check_mark: | :heavy_check_mark: | -|F.glu | | -|F.grid_sample | :heavy_check_mark: | +|F.glu | :heavy_check_mark: | :heavy_check_mark: | +|F.grid_sample | :heavy_check_mark: | :heavy_check_mark: | |F.group_norm | :heavy_check_mark: | :heavy_check_mark: | |F.gumbel_softmax | | |F.hardshrink | :heavy_check_mark: | @@ -655,7 +664,7 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |F.tanhshrink | :heavy_check_mark: | |F.threshold | :heavy_check_mark: | |F.threshold_ | :heavy_check_mark: | -|F.unfold | | +|F.unfold | :heavy_check_mark: | :heavy_check_mark: | |F.upsample | :heavy_check_mark: | :heavy_check_mark: | |F.upsample_bilinear | :heavy_check_mark: | :heavy_check_mark: | |F.upsample_nearest | :heavy_check_mark: | :heavy_check_mark: | diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index f9e29f7fe2c0..9005df1ecc99 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -4,6 +4,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) set(pnnx_pass_level0_SRCS pass_level0/constant_unpooling.cpp pass_level0/inline_block.cpp + pass_level0/reset_device.cpp pass_level0/shape_inference.cpp ) @@ -37,7 +38,9 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_Dropout3d.cpp pass_level1/nn_ELU.cpp pass_level1/nn_Embedding.cpp + pass_level1/nn_Fold.cpp pass_level1/nn_GELU.cpp + pass_level1/nn_GLU.cpp pass_level1/nn_GroupNorm.cpp pass_level1/nn_GRU.cpp pass_level1/nn_Hardshrink.cpp @@ -78,6 +81,7 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_Sigmoid.cpp pass_level1/nn_SiLU.cpp pass_level1/nn_Softmax.cpp + pass_level1/nn_Softmax2d.cpp pass_level1/nn_Softmin.cpp pass_level1/nn_Softplus.cpp pass_level1/nn_Softshrink.cpp @@ -85,6 +89,7 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_Tanh.cpp pass_level1/nn_Tanhshrink.cpp pass_level1/nn_Threshold.cpp + pass_level1/nn_Unfold.cpp pass_level1/nn_Upsample.cpp pass_level1/nn_UpsamplingBilinear2d.cpp pass_level1/nn_UpsamplingNearest2d.cpp @@ -126,6 +131,7 @@ set(pnnx_pass_level2_SRCS pass_level2/F_elu.cpp pass_level2/F_embedding.cpp pass_level2/F_feature_alpha_dropout.cpp + pass_level2/F_fold.cpp pass_level2/F_gelu.cpp pass_level2/F_glu.cpp pass_level2/F_grid_sample.cpp @@ -167,10 +173,12 @@ set(pnnx_pass_level2_SRCS pass_level2/F_tanh.cpp pass_level2/F_tanhshrink.cpp pass_level2/F_threshold.cpp + pass_level2/F_unfold.cpp pass_level2/F_upsample_bilinear.cpp pass_level2/F_upsample_nearest.cpp pass_level2/F_upsample.cpp pass_level2/Tensor_contiguous.cpp + pass_level2/Tensor_copy.cpp pass_level2/Tensor_expand.cpp pass_level2/Tensor_expand_as.cpp pass_level2/Tensor_index.cpp @@ -194,6 +202,8 @@ set(pnnx_pass_level2_SRCS pass_level2/torch_bitwise_and.cpp pass_level2/torch_bitwise_or.cpp pass_level2/torch_bitwise_xor.cpp + pass_level2/torch_bitwise_left_shift.cpp + pass_level2/torch_bitwise_right_shift.cpp pass_level2/torch_cat.cpp pass_level2/torch_chunk.cpp pass_level2/torch_clamp.cpp @@ -300,10 +310,11 @@ set(pnnx_pass_level5_SRCS pass_level5/eliminate_noop_expression.cpp pass_level5/eliminate_noop_pad.cpp pass_level5/eliminate_noop_upsample.cpp - pass_level5/eliminate_slice.cpp - pass_level5/eliminate_view_reshape.cpp + pass_level5/eliminate_noop_slice.cpp + pass_level5/eliminate_noop_view_reshape.cpp pass_level5/eval_expression.cpp pass_level5/fold_constants.cpp + pass_level5/fuse_adjacent_reshape.cpp 
pass_level5/fuse_channel_shuffle.cpp pass_level5/fuse_constant_expression.cpp pass_level5/fuse_conv1d_batchnorm1d.cpp @@ -312,10 +323,19 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_convtranspose2d_batchnorm2d.cpp pass_level5/fuse_contiguous_view.cpp pass_level5/fuse_linear_batchnorm1d.cpp + pass_level5/fuse_pad_conv1d.cpp + pass_level5/fuse_pad_conv2d.cpp pass_level5/fuse_select_to_unbind.cpp + pass_level5/fuse_slice_copy.cpp pass_level5/fuse_slice_indices.cpp pass_level5/fuse_slice_to_tensor_split.cpp + pass_level5/fuse_static_batchnorm.cpp pass_level5/fuse_static_conv.cpp + pass_level5/fuse_static_convtranspose.cpp + pass_level5/fuse_static_groupnorm.cpp + pass_level5/fuse_static_instancenorm.cpp + pass_level5/fuse_static_layernorm.cpp + pass_level5/fuse_static_linear.cpp pass_level5/normalize_einsum_equation.cpp pass_level5/unroll_rnn_op.cpp ) @@ -338,7 +358,6 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/insert_split.cpp pass_ncnn/chain_multi_output.cpp pass_ncnn/solve_batch_index.cpp - pass_ncnn/convert_to_fp16_model.cpp pass_ncnn/eliminate_noop.cpp pass_ncnn/eliminate_tail_reshape_permute.cpp @@ -373,8 +392,10 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/F_conv3d.cpp pass_ncnn/F_elu.cpp pass_ncnn/F_embedding.cpp + pass_ncnn/F_fold.cpp pass_ncnn/F_gelu.cpp pass_ncnn/F_glu.cpp + pass_ncnn/F_grid_sample.cpp pass_ncnn/F_group_norm.cpp pass_ncnn/F_hardsigmoid.cpp pass_ncnn/F_hardswish.cpp @@ -401,6 +422,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/F_silu.cpp pass_ncnn/F_softmax.cpp pass_ncnn/F_tanh.cpp + pass_ncnn/F_unfold.cpp pass_ncnn/F_upsample_bilinear.cpp pass_ncnn/F_upsample_nearest.cpp pass_ncnn/F_upsample.cpp @@ -428,7 +450,9 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/nn_ConvTranspose3d.cpp pass_ncnn/nn_ELU.cpp pass_ncnn/nn_Embedding.cpp + pass_ncnn/nn_Fold.cpp pass_ncnn/nn_GELU.cpp + pass_ncnn/nn_GLU.cpp pass_ncnn/nn_GroupNorm.cpp pass_ncnn/nn_GRU.cpp pass_ncnn/nn_Hardsigmoid.cpp @@ -459,7 +483,9 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/nn_Sigmoid.cpp pass_ncnn/nn_SiLU.cpp pass_ncnn/nn_Softmax.cpp + pass_ncnn/nn_Softmax2d.cpp pass_ncnn/nn_Tanh.cpp + pass_ncnn/nn_Unfold.cpp pass_ncnn/nn_Upsample.cpp pass_ncnn/nn_UpsamplingBilinear2d.cpp pass_ncnn/nn_UpsamplingNearest2d.cpp @@ -495,6 +521,27 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/icefall_RelShift.cpp ) +find_package(Protobuf) +if(PROTOBUF_FOUND) + protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx.proto) + + add_library(pnnx2onnx STATIC + save_onnx.cpp + save_onnx_cxxabi_bridge.cpp + ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS} + ) + + target_include_directories(pnnx2onnx PRIVATE ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(pnnx2onnx PRIVATE ${PROTOBUF_LIBRARIES}) + + # libtorch is usually compiled with old cxx11 abi + set_source_files_properties(save_onnx_cxxabi_bridge.cpp PROPERTIES COMPILE_FLAGS "${TORCH_CXX_FLAGS}") + + message(STATUS "Building with onnx-zero") +else() + message(STATUS "Building without onnx-zero") +endif() + set(pnnx_SRCS main.cpp ir.cpp @@ -508,8 +555,6 @@ set(pnnx_SRCS pass_level4.cpp pass_level5.cpp - pass_ncnn.cpp - ${pnnx_pass_level0_SRCS} ${pnnx_pass_level1_SRCS} ${pnnx_pass_level2_SRCS} @@ -517,6 +562,8 @@ set(pnnx_SRCS ${pnnx_pass_level4_SRCS} ${pnnx_pass_level5_SRCS} + pass_ncnn.cpp + save_ncnn.cpp ${pnnx_pass_ncnn_SRCS} ) @@ -526,6 +573,8 @@ endif() add_executable(pnnx ${pnnx_SRCS}) +target_compile_definitions(pnnx PRIVATE BUILD_PNNX) + if(PNNX_COVERAGE) target_compile_options(pnnx PUBLIC -coverage -fprofile-arcs -ftest-coverage) target_link_libraries(pnnx PUBLIC -coverage -lgcov) @@ -535,6 +584,11 
@@ if(WIN32) target_compile_definitions(pnnx PUBLIC NOMINMAX) endif() +if(PROTOBUF_FOUND) + target_compile_definitions(pnnx PRIVATE BUILD_PNNX2ONNX) + target_link_libraries(pnnx PRIVATE pnnx2onnx) +endif() + if(TorchVision_FOUND) target_link_libraries(pnnx PRIVATE TorchVision::TorchVision) endif() diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 86cdd75f720c..062092fe9c47 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -14,14 +14,18 @@ #include "ir.h" +#include #include +#include #include #include #include #include #include +#if BUILD_PNNX #include +#endif #include "storezip.h" @@ -129,6 +133,7 @@ static int string_to_type(const char* s) return 0; // null } +#if BUILD_PNNX int get_at_tensor_type(const at::ScalarType& st) { if (st == c10::ScalarType::Float) return 1; @@ -177,7 +182,10 @@ Parameter::Parameter(const torch::jit::Node* value_node) case c10::TypeKind::IntType: { type = 2; - i = (int)value_node->i(torch::jit::attr::value); + int64_t i64 = value_node->i(torch::jit::attr::value); + if (i64 == LONG_MAX) i64 = INT_MAX; + if (i64 == LONG_MIN) i64 = INT_MIN; + i = (int)i64; break; } case c10::TypeKind::FloatType: @@ -201,7 +209,10 @@ Parameter::Parameter(const torch::jit::Node* value_node) if (t.scalar_type() == c10::ScalarType::Long) { type = 2; - i = (int)t.item(); + int64_t i64 = t.item(); + if (i64 == LONG_MAX) i64 = INT_MAX; + if (i64 == LONG_MIN) i64 = INT_MIN; + i = (int)i64; } else if (t.scalar_type() == c10::ScalarType::Int) { @@ -288,6 +299,7 @@ Parameter::Parameter(const torch::jit::Value* value) : Parameter(value->node()) { } +#endif // BUILD_PNNX bool operator==(const Parameter& lhs, const Parameter& rhs) { @@ -321,6 +333,7 @@ bool operator==(const Parameter& lhs, const Parameter& rhs) return false; } +#if BUILD_PNNX Attribute::Attribute(const at::Tensor& t) { type = get_at_tensor_type(t.scalar_type()); @@ -377,6 +390,7 @@ Attribute::Attribute(const at::Tensor& t) memcpy((void*)data.data(), (const void*)t.cpu().contiguous().data_ptr(), data.size()); } } +#endif // BUILD_PNNX Attribute::Attribute(const std::initializer_list& _shape, const std::vector& t) { @@ -1046,14 +1060,58 @@ static std::string expand_expression(const Operator* op) std::string r = a + ".size(" + b + ")"; exprstack.push(r); } - else if (t == "int" || t == "sqrt" || t == "rsqrt" || t == "neg" || t == "floor") + else if (t == "int" + || t == "abs" + || t == "acos" + || t == "acosh" + || t == "asin" + || t == "asinh" + || t == "atan" + || t == "atanh" + || t == "ceil" + || t == "cos" + || t == "cosh" + || t == "exp" + || t == "floor" + || t == "log" + || t == "neg" + || t == "reciprocal" + || t == "rsqrt" + || t == "sign" + || t == "sin" + || t == "sinh" + || t == "sqrt" + || t == "square" + || t == "tan" + || t == "tanh" + || t == "trunc") { std::string unaryop; if (t == "int") unaryop = "int"; - if (t == "sqrt") unaryop = "torch.sqrt"; - if (t == "rsqrt") unaryop = "torch.rsqrt"; - if (t == "neg") unaryop = "torch.neg"; + if (t == "abs") unaryop = "torch.abs"; + if (t == "acos") unaryop = "torch.acos"; + if (t == "acosh") unaryop = "torch.acosh"; + if (t == "asin") unaryop = "torch.asin"; + if (t == "asinh") unaryop = "torch.asinh"; + if (t == "atan") unaryop = "torch.atan"; + if (t == "atanh") unaryop = "torch.atanh"; + if (t == "ceil") unaryop = "torch.ceil"; + if (t == "cos") unaryop = "torch.cos"; + if (t == "cosh") unaryop = "torch.cosh"; + if (t == "exp") unaryop = "torch.exp"; if (t == "floor") unaryop = "torch.floor"; + if (t == "log") unaryop = "torch.log"; + if 
(t == "neg") unaryop = "torch.neg"; + if (t == "reciprocal") unaryop = "torch.reciprocal"; + if (t == "rsqrt") unaryop = "torch.rsqrt"; + if (t == "sign") unaryop = "torch.sign"; + if (t == "sin") unaryop = "torch.sin"; + if (t == "sinh") unaryop = "torch.sinh"; + if (t == "sqrt") unaryop = "torch.sqrt"; + if (t == "square") unaryop = "torch.square"; + if (t == "tan") unaryop = "torch.tan"; + if (t == "tanh") unaryop = "torch.tanh"; + if (t == "trunc") unaryop = "torch.trunc"; std::string a = exprstack.top(); exprstack.pop(); @@ -1061,17 +1119,22 @@ static std::string expand_expression(const Operator* op) std::string r = unaryop + "(" + a + ")"; exprstack.push(r); } - else if (t == "pow") + else if (t == "atan2" + || t == "pow") { + std::string binaryop; + if (t == "atan2") binaryop = "torch.atan2"; + if (t == "pow") binaryop = "torch.pow"; + std::string a = exprstack.top(); exprstack.pop(); std::string b = exprstack.top(); exprstack.pop(); - std::string r = a + ".pow(" + b + ")"; + std::string r = binaryop + "(" + a + ", " + b + ")"; exprstack.push(r); } - else if (t == "add" || t == "sub" || t == "mul" || t == "div" || t == "floor_divide" || t == "and" || t == "or" || t == "xor") + else if (t == "add" || t == "sub" || t == "mul" || t == "div" || t == "floor_divide" || t == "and" || t == "or" || t == "xor" || t == "lshift" || t == "rshift") { std::string binaryop; if (t == "add") binaryop = "+"; @@ -1082,6 +1145,8 @@ static std::string expand_expression(const Operator* op) if (t == "and") binaryop = "&"; if (t == "or") binaryop = "|"; if (t == "xor") binaryop = "^"; + if (t == "lshift") binaryop = "<<"; + if (t == "rshift") binaryop = ">>"; std::string a = exprstack.top(); exprstack.pop(); @@ -1196,7 +1261,7 @@ static std::string make_slice_expression(const Operator* op) { std::vector ends = op->params.at("ends").ai; int end = ends[i]; - if (end != -1) + if (end != INT_MAX) r += std::to_string(end); } else @@ -1283,9 +1348,9 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) fprintf(pyfp, "import torch.nn as nn\n"); fprintf(pyfp, "import torch.nn.functional as F\n"); fprintf(pyfp, "try:\n"); - fprintf(pyfp, "\timport torchvision\n"); + fprintf(pyfp, " import torchvision\n"); fprintf(pyfp, "except:\n"); - fprintf(pyfp, "\tpass\n"); + fprintf(pyfp, " pass\n"); fprintf(pyfp, "\n"); @@ -1595,6 +1660,13 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) std::string slice_expr = make_slice_expression(op); fprintf(pyfp, "v_%s = v_%s[%s]\n", sanitize_identifier(op->outputs[0]->name).c_str(), sanitize_identifier(op->inputs[0]->name).c_str(), slice_expr.c_str()); } + else if (op->type == "Tensor.slice_copy") + { + // slice copy expr + std::string slice_expr = make_slice_expression(op); + fprintf(pyfp, "v_%s = v_%s\n", sanitize_identifier(op->outputs[0]->name).c_str(), sanitize_identifier(op->inputs[0]->name).c_str()); + fprintf(pyfp, " v_%s[%s] = v_%s\n", sanitize_identifier(op->outputs[0]->name).c_str(), slice_expr.c_str(), sanitize_identifier(op->inputs[1]->name).c_str()); + } else if (op->type == "Tensor.index") { // index expr @@ -1762,8 +1834,14 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) fprintf(pyfp, " = self.%s(", sanitize_identifier(op->name).c_str()); if (op->inputs.size() == 1) { - const char* in0 = sanitize_identifier(op->inputs[0]->name).c_str(); - fprintf(pyfp, "v_%s, v_%s, v_%s", in0, in0, in0); + std::string in0 = sanitize_identifier(op->inputs[0]->name); + fprintf(pyfp, "v_%s, v_%s, v_%s", 
in0.c_str(), in0.c_str(), in0.c_str()); + } + else if (op->inputs.size() == 2) + { + std::string in0 = sanitize_identifier(op->inputs[0]->name); + std::string in1 = sanitize_identifier(op->inputs[1]->name); + fprintf(pyfp, "v_%s, v_%s, v_%s", in0.c_str(), in1.c_str(), in1.c_str()); } else { @@ -2233,314 +2311,6 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) return 0; } -static bool string_is_positive_integer(const std::string& t) -{ - for (size_t i = 0; i < t.size(); i++) - { - if (t[i] < '0' || t[i] > '9') - return false; - } - - return true; -} - -int Graph::ncnn(const std::string& parampath, const std::string& binpath, const std::string& pypath) -{ - FILE* paramfp = fopen(parampath.c_str(), "wb"); - if (!paramfp) - { - fprintf(stderr, "fopen %s failed\n", parampath.c_str()); - return -1; - } - - FILE* binfp = fopen(binpath.c_str(), "wb"); - if (!binfp) - { - fprintf(stderr, "fopen %s failed\n", binpath.c_str()); - fclose(paramfp); - return -1; - } - - // magic - fprintf(paramfp, "7767517\n"); - - // op count and oprand count - fprintf(paramfp, "%d %d\n", (int)ops.size(), (int)operands.size()); - - for (const Operator* op : ops) - { - fprintf(paramfp, "%-24s %-24s %d %d", op->type.c_str(), op->name.c_str(), (int)op->inputs.size(), (int)op->outputs.size()); - - for (const Operand* oprand : op->inputs) - { - fprintf(paramfp, " %s", oprand->name.c_str()); - } - - for (const Operand* oprand : op->outputs) - { - fprintf(paramfp, " %s", oprand->name.c_str()); - } - - for (const auto& it : op->params) - { - const Parameter& param = it.second; - - if (!string_is_positive_integer(it.first)) - { - fprintf(stderr, "ignore %s %s param %s=", op->type.c_str(), op->name.c_str(), it.first.c_str()); - - if (param.type == 0) - { - fprintf(stderr, "None"); - } - if (param.type == 1) - { - if (param.b) - fprintf(stderr, "True"); - else - fprintf(stderr, "False"); - } - if (param.type == 2) - { - fprintf(stderr, "%d", param.i); - } - if (param.type == 3) - { - fprintf(stderr, "%e", param.f); - } - if (param.type == 4) - { - fprintf(stderr, "%s", param.s.c_str()); - } - if (param.type == 5) - { - fprintf(stderr, "("); - for (size_t i = 0; i < param.ai.size(); i++) - { - fprintf(stderr, "%d", param.ai[i]); - if (i + 1 != param.ai.size()) - fprintf(stderr, ","); - } - fprintf(stderr, ")"); - } - if (param.type == 6) - { - fprintf(stderr, "("); - for (size_t i = 0; i < param.af.size(); i++) - { - fprintf(stderr, "%e", param.af[i]); - if (i + 1 != param.af.size()) - fprintf(stderr, ","); - } - fprintf(stderr, ")"); - } - if (param.type == 7) - { - fprintf(stderr, "("); - for (size_t i = 0; i < param.as.size(); i++) - { - fprintf(stderr, "%s", param.as[i].c_str()); - if (i + 1 != param.as.size()) - fprintf(stderr, ","); - } - fprintf(stderr, ")"); - } - fprintf(stderr, "\n"); - - continue; - } - - const int idkey = std::stoi(it.first); - if (param.type == 2) - { - fprintf(paramfp, " %d=%d", idkey, param.i); - } - if (param.type == 3) - { - fprintf(paramfp, " %d=%e", idkey, param.f); - } - if (param.type == 5) - { - const int array_size = (int)param.ai.size(); - fprintf(paramfp, " %d=%d", -23300 - idkey, array_size); - for (size_t i = 0; i < param.ai.size(); i++) - { - fprintf(paramfp, ",%d", param.ai[i]); - } - } - if (param.type == 6) - { - const int array_size = (int)param.af.size(); - fprintf(paramfp, " %d=%d", -23300 - idkey, array_size); - for (size_t i = 0; i < param.af.size(); i++) - { - fprintf(paramfp, ",%e", param.af[i]); - } - } - } - - for (const auto& it : op->attrs) - 
{ - // fprintf(paramfp, " @%s=", it.first.c_str()); - - const Attribute& attr = it.second; - - fwrite(attr.data.data(), attr.data.size(), 1, binfp); - } - - // if (op->inputnames.size() == op->inputs.size()) - // { - // for (size_t i = 0; i < op->inputs.size(); i++) - // { - // const Operand* oprand = op->inputs[i]; - // fprintf(paramfp, " $%s=%s", op->inputnames[i].c_str(), oprand->name.c_str()); - // } - // } - - // for (const Operand* oprand : op->outputs) - // { - // if (oprand->params.find("__batch_index") == oprand->params.end()) - // continue; - // - // const int batch_index = oprand->params.at("__batch_index").i; - // - // fprintf(paramfp, " #%s=%d", oprand->name.c_str(), batch_index); - // } - - // for (const Operand* oprand : op->outputs) - // { - // if (oprand->shape.empty()) - // continue; - // - // fprintf(paramfp, " #%s=", oprand->name.c_str()); - // - // fprintf(paramfp, "("); - // for (int64_t i = 0; i < oprand->shape.size() - 1; i++) - // { - // fprintf(paramfp, "%d,", oprand->shape[i]); - // } - // if (oprand->shape.size() > 0) - // fprintf(paramfp, "%d", oprand->shape[oprand->shape.size() - 1]); - // fprintf(paramfp, ")"); - // - // fprintf(paramfp, type_to_string(oprand->type)); - // } - - fprintf(paramfp, "\n"); - } - - fclose(paramfp); - fclose(binfp); - - FILE* pyfp = fopen(pypath.c_str(), "wb"); - if (!pyfp) - { - fprintf(stderr, "fopen %s failed\n", pypath.c_str()); - return -1; - } - - fprintf(pyfp, "import numpy as np\n"); - fprintf(pyfp, "import ncnn\n"); - fprintf(pyfp, "import torch\n"); - - fprintf(pyfp, "\n"); - - // test inference - { - fprintf(pyfp, "def test_inference():\n"); - fprintf(pyfp, " torch.manual_seed(0)\n"); - - for (int input_index = 0;; input_index++) - { - std::string input_name = std::string("in") + std::to_string(input_index); - const Operand* r = get_operand(input_name); - if (!r) - break; - - if (type_is_integer(r->type)) - { - fprintf(pyfp, " %s = torch.randint(10, (", input_name.c_str()); - for (size_t i = 0; i < r->shape.size(); i++) - { - fprintf(pyfp, "%d", r->shape[i]); - if (i + 1 != r->shape.size() || r->shape.size() == 1) - fprintf(pyfp, ", "); - } - fprintf(pyfp, "), dtype=%s)\n", type_to_dtype_string(r->type)); - } - else - { - fprintf(pyfp, " %s = torch.rand(", input_name.c_str()); - for (size_t i = 0; i < r->shape.size(); i++) - { - fprintf(pyfp, "%d, ", r->shape[i]); - } - fprintf(pyfp, "dtype=%s)\n", type_to_dtype_string(r->type)); - } - } - - fprintf(pyfp, " out = []\n"); - fprintf(pyfp, "\n"); - - fprintf(pyfp, " with ncnn.Net() as net:\n"); - fprintf(pyfp, " net.load_param(\"%s\")\n", parampath.c_str()); - fprintf(pyfp, " net.load_model(\"%s\")\n", binpath.c_str()); - fprintf(pyfp, "\n"); - fprintf(pyfp, " with net.create_extractor() as ex:\n"); - - for (int input_index = 0;; input_index++) - { - std::string input_name = std::string("in") + std::to_string(input_index); - const Operand* r = get_operand(input_name); - if (!r) - break; - - const int batch_index = r->params.at("__batch_index").i; - if (batch_index != 233) - { - fprintf(pyfp, " ex.input(\"%s\", ncnn.Mat(%s.squeeze(%d).numpy()).clone())\n", input_name.c_str(), input_name.c_str(), batch_index); - } - else - { - fprintf(pyfp, " ex.input(\"%s\", ncnn.Mat(%s.numpy()).clone())\n", input_name.c_str(), input_name.c_str()); - } - } - - fprintf(pyfp, "\n"); - - for (int output_index = 0;; output_index++) - { - std::string output_name = std::string("out") + std::to_string(output_index); - const Operand* r = get_operand(output_name); - if (!r) - break; - - fprintf(pyfp, 
" _, %s = ex.extract(\"%s\")\n", output_name.c_str(), output_name.c_str()); - - const int batch_index = r->params.at("__batch_index").i; - if (batch_index != 233) - { - fprintf(pyfp, " out.append(torch.from_numpy(np.array(%s)).unsqueeze(%d))\n", output_name.c_str(), batch_index); - } - else - { - fprintf(pyfp, " out.append(torch.from_numpy(np.array(%s)))\n", output_name.c_str()); - } - } - - fprintf(pyfp, "\n"); - - fprintf(pyfp, " if len(out) == 1:\n"); - fprintf(pyfp, " return out[0]\n"); - fprintf(pyfp, " else:\n"); - fprintf(pyfp, " return tuple(out)\n"); - } - - fclose(pyfp); - - return 0; -} - int Graph::parse(const std::string& param) { std::istringstream is(param); @@ -2675,6 +2445,7 @@ Operator* Graph::new_operator_after(const std::string& type, const std::string& return op; } +#if BUILD_PNNX Operand* Graph::new_operand(const torch::jit::Value* v) { Operand* r = new Operand; @@ -2701,6 +2472,7 @@ Operand* Graph::new_operand(const torch::jit::Value* v) operands.push_back(r); return r; } +#endif // BUILD_PNNX Operand* Graph::new_operand(const std::string& name) { @@ -2721,4 +2493,15 @@ Operand* Graph::get_operand(const std::string& name) return 0; } +const Operand* Graph::get_operand(const std::string& name) const +{ + for (const Operand* r : operands) + { + if (r->name == name) + return r; + } + + return 0; +} + } // namespace pnnx diff --git a/tools/pnnx/src/ir.h b/tools/pnnx/src/ir.h index 06fe09c14bd2..740e40192cc9 100644 --- a/tools/pnnx/src/ir.h +++ b/tools/pnnx/src/ir.h @@ -17,9 +17,11 @@ #include #include +#include #include #include +#if BUILD_PNNX namespace torch { namespace jit { struct Value; @@ -29,6 +31,7 @@ struct Node; namespace at { class Tensor; } +#endif // BUILD_PNNX namespace pnnx { @@ -114,8 +117,10 @@ class Parameter { } +#if BUILD_PNNX Parameter(const torch::jit::Node* value_node); Parameter(const torch::jit::Value* value); +#endif // BUILD_PNNX static Parameter parse_from_string(const std::string& value); @@ -126,9 +131,11 @@ class Parameter bool b; int i; float f; - std::string s; std::vector ai; std::vector af; + + // keep std::string typed member the last for cross cxxabi compatibility + std::string s; std::vector as; }; @@ -142,7 +149,9 @@ class Attribute { } +#if BUILD_PNNX Attribute(const at::Tensor& t); +#endif // BUILD_PNNX Attribute(const std::initializer_list& shape, const std::vector& t); @@ -164,8 +173,6 @@ class Operand public: void remove_consumer(const Operator* c); - std::string name; - Operator* producer; std::vector consumers; @@ -173,6 +180,9 @@ class Operand int type; std::vector shape; + // keep std::string typed member the last for cross cxxabi compatibility + std::string name; + std::map params; private: @@ -185,12 +195,13 @@ class Operand class Operator { public: - std::string type; - std::string name; - std::vector inputs; std::vector outputs; + // keep std::string typed member the last for cross cxxabi compatibility + std::string type; + std::string name; + std::vector inputnames; std::map params; std::map attrs; @@ -213,8 +224,6 @@ class Graph int python(const std::string& pypath, const std::string& binpath); - int ncnn(const std::string& parampath, const std::string& binpath, const std::string& pypath); - int parse(const std::string& param); Operator* new_operator(const std::string& type, const std::string& name); @@ -223,11 +232,14 @@ class Graph Operator* new_operator_after(const std::string& type, const std::string& name, const Operator* cur); +#if BUILD_PNNX Operand* new_operand(const torch::jit::Value* v); +#endif Operand* 
new_operand(const std::string& name); Operand* get_operand(const std::string& name); + const Operand* get_operand(const std::string& name) const; std::vector ops; std::vector operands; diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp index 87ecfecd6482..98066c4c547c 100644 --- a/tools/pnnx/src/main.cpp +++ b/tools/pnnx/src/main.cpp @@ -39,6 +39,11 @@ #include "pass_level5.h" #include "pass_ncnn.h" +#include "save_ncnn.h" + +#if BUILD_PNNX2ONNX +#include "save_onnx.h" +#endif static std::string get_basename(const std::string& path) { @@ -159,9 +164,11 @@ static void show_usage() fprintf(stderr, " pnnxparam=model.pnnx.param\n"); fprintf(stderr, " pnnxbin=model.pnnx.bin\n"); fprintf(stderr, " pnnxpy=model_pnnx.py\n"); + fprintf(stderr, " pnnxonnx=model.pnnx.onnx\n"); fprintf(stderr, " ncnnparam=model.ncnn.param\n"); fprintf(stderr, " ncnnbin=model.ncnn.bin\n"); fprintf(stderr, " ncnnpy=model_ncnn.py\n"); + fprintf(stderr, " fp16=1\n"); fprintf(stderr, " optlevel=2\n"); fprintf(stderr, " device=cpu/gpu\n"); fprintf(stderr, " inputshape=[1,3,224,224],...\n"); @@ -200,9 +207,11 @@ int main(int argc, char** argv) std::string pnnxparampath = ptbase + ".pnnx.param"; std::string pnnxbinpath = ptbase + ".pnnx.bin"; std::string pnnxpypath = ptbase + "_pnnx.py"; + std::string pnnxonnxpath = ptbase + ".pnnx.onnx"; std::string ncnnparampath = ptbase + ".ncnn.param"; std::string ncnnbinpath = ptbase + ".ncnn.bin"; std::string ncnnpypath = ptbase + "_ncnn.py"; + int fp16 = 1; int optlevel = 2; std::string device = "cpu"; std::vector > input_shapes; @@ -235,12 +244,16 @@ int main(int argc, char** argv) pnnxbinpath = std::string(value); if (strcmp(key, "pnnxpy") == 0) pnnxpypath = std::string(value); + if (strcmp(key, "pnnxonnx") == 0) + pnnxonnxpath = std::string(value); if (strcmp(key, "ncnnparam") == 0) ncnnparampath = std::string(value); if (strcmp(key, "ncnnbin") == 0) ncnnbinpath = std::string(value); if (strcmp(key, "ncnnpy") == 0) ncnnpypath = std::string(value); + if (strcmp(key, "fp16") == 0) + fp16 = atoi(value); if (strcmp(key, "optlevel") == 0) optlevel = atoi(value); if (strcmp(key, "device") == 0) @@ -260,9 +273,11 @@ int main(int argc, char** argv) fprintf(stderr, "pnnxparam = %s\n", pnnxparampath.c_str()); fprintf(stderr, "pnnxbin = %s\n", pnnxbinpath.c_str()); fprintf(stderr, "pnnxpy = %s\n", pnnxpypath.c_str()); + fprintf(stderr, "pnnxonnx = %s\n", pnnxonnxpath.c_str()); fprintf(stderr, "ncnnparam = %s\n", ncnnparampath.c_str()); fprintf(stderr, "ncnnbin = %s\n", ncnnbinpath.c_str()); fprintf(stderr, "ncnnpy = %s\n", ncnnpypath.c_str()); + fprintf(stderr, "fp16 = %d\n", fp16); fprintf(stderr, "optlevel = %d\n", optlevel); fprintf(stderr, "device = %s\n", device.c_str()); fprintf(stderr, "inputshape = "); @@ -327,7 +342,7 @@ int main(int argc, char** argv) try { - mod = torch::jit::load(ptpath); + mod = torch::jit::load(ptpath, (device == "gpu") ? 
c10::kCUDA : c10::kCPU); } catch (const c10::Error& e) { @@ -358,8 +373,9 @@ int main(int argc, char** argv) fprintf(stderr, "############# pass_level0\n"); - std::map foldable_constants; - pnnx::pass_level0(mod, g, input_tensors, input_tensors2, module_operators, ptpath, foldable_constants); + std::set foldable_constants; + std::string foldable_constants_zippath = ptbase + ".foldable_constants.zip"; + pnnx::pass_level0(mod, g, input_tensors, input_tensors2, module_operators, ptpath, device, foldable_constants, foldable_constants_zippath); // g->dump(); @@ -393,20 +409,29 @@ int main(int argc, char** argv) { fprintf(stderr, "############# pass_level5\n"); - pnnx::pass_level5(pnnx_graph, foldable_constants); + pnnx::pass_level5(pnnx_graph, foldable_constants, foldable_constants_zippath); } + // delete foldable_constants_zippath + remove(foldable_constants_zippath.c_str()); + pnnx_graph.save(pnnxparampath, pnnxbinpath); pnnx_graph.python(pnnxpypath, pnnxbinpath); +#if BUILD_PNNX2ONNX + pnnx::save_onnx(pnnx_graph, pnnxonnxpath.c_str(), fp16); +#else + fprintf(stderr, "pnnx build without onnx-zero support, skip saving onnx\n"); +#endif + // if (optlevel >= 2) { fprintf(stderr, "############# pass_ncnn\n"); pnnx::pass_ncnn(pnnx_graph); - pnnx_graph.ncnn(ncnnparampath, ncnnbinpath, ncnnpypath); + pnnx::save_ncnn(pnnx_graph, ncnnparampath, ncnnbinpath, ncnnpypath, fp16); } // pnnx::Graph pnnx_graph2; diff --git a/tools/pnnx/src/onnx.proto b/tools/pnnx/src/onnx.proto new file mode 100644 index 000000000000..461bd0b78cd6 --- /dev/null +++ b/tools/pnnx/src/onnx.proto @@ -0,0 +1,505 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// Copyright (c) ONNX Project Contributors. +// Licensed under the MIT license. + +syntax = "proto2"; + +package onnx; + +// Overview +// +// ONNX is an open specification that is comprised of the following components: +// +// 1) A definition of an extensible computation graph model. +// 2) Definitions of standard data types. +// 3) Definitions of built-in operators. +// +// This document describes the syntax of models and their computation graphs, +// as well as the standard data types. Together, they are referred to as the ONNX +// Intermediate Representation, or 'IR' for short. +// +// The normative semantic specification of the ONNX IR is found in docs/IR.md. +// Definitions of the built-in neural network operators may be found in docs/Operators.md. + +// Notes +// +// Release +// +// We are still in the very early stage of defining ONNX. The current +// version of ONNX is a starting point. While we are actively working +// towards a complete spec, we would like to get the community involved +// by sharing our working version of ONNX. +// +// Protobuf compatibility +// +// To simplify framework compatibility, ONNX is defined using the subset of protobuf +// that is compatible with both protobuf v2 and v3. This means that we do not use any +// protobuf features that are only available in one of the two versions. +// +// Here are the most notable contortions we have to carry out to work around +// these limitations: +// +// - No 'map' (added protobuf 3.0). We instead represent mappings as lists +// of key-value pairs, where order does not matter and duplicates +// are not allowed. 
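To illustrate the map-free convention described above, a minimal C++ sketch follows (not part of this patch): it folds ModelProto.metadata_props, the repeated StringStringEntryProto field defined further down in this file, back into an ordinary std::map via the accessors protoc generates from this schema. The header name onnx.pb.h and the helper collect_metadata are assumed names introduced only for the example.

```
// minimal sketch: fold the repeated key-value entries of metadata_props into a
// std::map, since the schema deliberately avoids the proto3 'map' feature
#include <map>
#include <string>

#include "onnx.pb.h" // assumed name of the protoc output for this onnx.proto

static std::map<std::string, std::string> collect_metadata(const onnx::ModelProto& model)
{
    std::map<std::string, std::string> props;
    for (int i = 0; i < model.metadata_props_size(); i++)
    {
        const onnx::StringStringEntryProto& entry = model.metadata_props(i);
        props[entry.key()] = entry.value(); // keys are documented as distinct
    }
    return props;
}
```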
+ + +// Versioning +// +// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md +// +// To be compatible with both proto2 and proto3, we will use a version number +// that is not defined by the default value but an explicit enum number. +enum Version { + // proto3 requires the first enum value to be zero. + // We add this just to appease the compiler. + _START_VERSION = 0; + // The version field is always serialized and we will use it to store the + // version that the graph is generated from. This helps us set up version + // control. + // For the IR, we are using simple numbers starting with with 0x00000001, + // which was the version we published on Oct 10, 2017. + IR_VERSION_2017_10_10 = 0x0000000000000001; + + // IR_VERSION 2 published on Oct 30, 2017 + // - Added type discriminator to AttributeProto to support proto3 users + IR_VERSION_2017_10_30 = 0x0000000000000002; + + // IR VERSION 3 published on Nov 3, 2017 + // - For operator versioning: + // - Added new message OperatorSetIdProto + // - Added opset_import in ModelProto + // - For vendor extensions, added domain in NodeProto + IR_VERSION_2017_11_3 = 0x0000000000000003; + + // IR VERSION 4 published on Jan 22, 2019 + // - Relax constraint that initializers should be a subset of graph inputs + // - Add type BFLOAT16 + IR_VERSION_2019_1_22 = 0x0000000000000004; + + // IR VERSION 5 published on March 18, 2019 + // - Add message TensorAnnotation. + // - Add quantization annotation in GraphProto to map tensor with its scale and zero point quantization parameters. + IR_VERSION = 0x0000000000000005; +} + +// Attributes +// +// A named attribute containing either singular float, integer, string, graph, +// and tensor values, or repeated float, integer, string, graph, and tensor values. +// An AttributeProto MUST contain the name field, and *only one* of the +// following content fields, effectively enforcing a C/C++ union equivalent. +message AttributeProto { + + // Note: this enum is structurally identical to the OpSchema::AttrType + // enum defined in schema.h. If you rev one, you likely need to rev the other. + enum AttributeType { + UNDEFINED = 0; + FLOAT = 1; + INT = 2; + STRING = 3; + TENSOR = 4; + GRAPH = 5; + + FLOATS = 6; + INTS = 7; + STRINGS = 8; + TENSORS = 9; + GRAPHS = 10; + } + + // The name field MUST be present for this version of the IR. + optional string name = 1; // namespace Attribute + + // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. + // In this case, this AttributeProto does not contain data, and it's a reference of attribute + // in parent scope. + // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. + optional string ref_attr_name = 21; + + // A human-readable documentation for this attribute. Markdown is allowed. + optional string doc_string = 13; + + // The type field MUST be present for this version of the IR. + // For 0.0.1 versions of the IR, this field was not defined, and + // implementations needed to use has_field hueristics to determine + // which value field was in use. For IR_VERSION 0.0.2 or later, this + // field MUST be set and match the f|i|s|t|... field in use. This + // change was made to accommodate proto3 implementations. 
+ optional AttributeType type = 20; // discriminator that indicates which field below is in use + + // Exactly ONE of the following fields must be present for this version of the IR + optional float f = 2; // float + optional int64 i = 3; // int + optional bytes s = 4; // UTF-8 string + optional TensorProto t = 5; // tensor value + optional GraphProto g = 6; // graph + // Do not use field below, it's deprecated. + // optional ValueProto v = 12; // value - subsumes everything but graph + + repeated float floats = 7; // list of floats + repeated int64 ints = 8; // list of ints + repeated bytes strings = 9; // list of UTF-8 strings + repeated TensorProto tensors = 10; // list of tensors + repeated GraphProto graphs = 11; // list of graph +} + +// Defines information on value, including the name, the type, and +// the shape of the value. +message ValueInfoProto { + // This field MUST be present in this version of the IR. + optional string name = 1; // namespace Value + // This field MUST be present in this version of the IR. + optional TypeProto type = 2; + // A human-readable documentation for this value. Markdown is allowed. + optional string doc_string = 3; +} + +// Nodes +// +// Computation graphs are made up of a DAG of nodes, which represent what is +// commonly called a "layer" or "pipeline stage" in machine learning frameworks. +// +// For example, it can be a node of type "Conv" that takes in an image, a filter +// tensor and a bias tensor, and produces the convolved output. +message NodeProto { + repeated string input = 1; // namespace Value + repeated string output = 2; // namespace Value + + // An optional identifier for this node in a graph. + // This field MAY be absent in ths version of the IR. + optional string name = 3; // namespace Node + + // The symbolic identifier of the Operator to execute. + optional string op_type = 4; // namespace Operator + // The domain of the OperatorSet that specifies the operator named by op_type. + optional string domain = 7; // namespace Domain + + // Additional named attributes. + repeated AttributeProto attribute = 5; + + // A human-readable documentation for this node. Markdown is allowed. + optional string doc_string = 6; +} + +// Models +// +// ModelProto is a top-level file/container format for bundling a ML model and +// associating its computation graph with metadata. +// +// The semantics of the model are described by the associated GraphProto. +message ModelProto { + // The version of the IR this model targets. See Version enum above. + // This field MUST be present. + optional int64 ir_version = 1; + + // The OperatorSets this model relies on. + // All ModelProtos MUST have at least one entry that + // specifies which version of the ONNX OperatorSet is + // being imported. + // + // All nodes in the ModelProto's graph will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. + repeated OperatorSetIdProto opset_import = 8; + + // The name of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_name = 2; + + // The version of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_version = 3; + + // Domain name of the model. + // We use reverse domain names as name space indicators. 
For example: + // `com.facebook.fair` or `com.microsoft.cognitiveservices` + // + // Together with `model_version` and GraphProto.name, this forms the unique identity of + // the graph. + optional string domain = 4; + + // The version of the graph encoded. See Version enum below. + optional int64 model_version = 5; + + // A human-readable documentation for this model. Markdown is allowed. + optional string doc_string = 6; + + // The parameterized graph that is evaluated to execute the model. + optional GraphProto graph = 7; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 14; +}; + +// StringStringEntryProto follows the pattern for cross-proto-version maps. +// See https://developers.google.com/protocol-buffers/docs/proto3#maps +message StringStringEntryProto { + optional string key = 1; + optional string value= 2; +}; + +message TensorAnnotation { + optional string tensor_name = 1; + // pairs to annotate tensor specified by above. + // The keys used in the mapping below must be pre-defined in ONNX spec. + // For example, for 8-bit linear quantization case, 'SCALE_TENSOR', 'ZERO_POINT_TENSOR' will be pre-defined as + // quantization parameter keys. + repeated StringStringEntryProto quant_parameter_tensor_names = 2; +} + + + +// Graphs +// +// A graph defines the computational logic of a model and is comprised of a parameterized +// list of nodes that form a directed acyclic graph based on their inputs and outputs. +// This is the equivalent of the "network" or "graph" in many deep learning +// frameworks. +message GraphProto { + // The nodes in the graph, sorted topologically. + repeated NodeProto node = 1; + + // The name of the graph. + optional string name = 2; // namespace Graph + + // A list of named tensor values, used to specify constant inputs of the graph. + // Each TensorProto entry must have a distinct name (within the list) that + // MAY also appear in the input list. + repeated TensorProto initializer = 5; + + // A human-readable documentation for this graph. Markdown is allowed. + optional string doc_string = 10; + + // The inputs and outputs of the graph. + repeated ValueInfoProto input = 11; + repeated ValueInfoProto output = 12; + + // Information for the values in the graph. The ValueInfoProto.name's + // must be distinct. It is optional for a value to appear in value_info list. + repeated ValueInfoProto value_info = 13; + + // This field carries information to indicate the mapping among a tensor and its + // quantization parameter tensors. For example: + // For tensor 'a', it may have {'SCALE_TENSOR', 'a_scale'} and {'ZERO_POINT_TENSOR', 'a_zero_point'} annotated, + // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. + repeated TensorAnnotation quantization_annotation = 14; + + // DO NOT USE the following fields, they were deprecated from earlier versions. + // repeated string input = 3; + // repeated string output = 4; + // optional int64 ir_version = 6; + // optional int64 producer_version = 7; + // optional string producer_tag = 8; + // optional string domain = 9; +} + +// Tensors +// +// A serialized tensor value. +message TensorProto { + enum DataType { + UNDEFINED = 0; + // Basic types. 
+ FLOAT = 1; // float + UINT8 = 2; // uint8_t + INT8 = 3; // int8_t + UINT16 = 4; // uint16_t + INT16 = 5; // int16_t + INT32 = 6; // int32_t + INT64 = 7; // int64_t + STRING = 8; // string + BOOL = 9; // bool + + // IEEE754 half-precision floating-point format (16 bits wide). + // This format has 1 sign bit, 5 exponent bits, and 10 mantissa bits. + FLOAT16 = 10; + + DOUBLE = 11; + UINT32 = 12; + UINT64 = 13; + COMPLEX64 = 14; // complex with float32 real and imaginary components + COMPLEX128 = 15; // complex with float64 real and imaginary components + + // Non-IEEE floating-point format based on IEEE754 single-precision + // floating-point number truncated to 16 bits. + // This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits. + BFLOAT16 = 16; + + // Future extensions go here. + } + + // The shape of the tensor. + repeated int64 dims = 1; + + // The data type of the tensor. + // This field MUST have a valid TensorProto.DataType value + optional int32 data_type = 2; + + // For very large tensors, we may want to store them in chunks, in which + // case the following fields will specify the segment that is stored in + // the current TensorProto. + message Segment { + optional int64 begin = 1; + optional int64 end = 2; + } + optional Segment segment = 3; + + // Tensor content must be organized in row-major order. + // + // Depending on the data_type field, exactly one of the fields below with + // name ending in _data is used to store the elements of the tensor. + + // For float and complex64 values + // Complex64 tensors are encoded as a single array of floats, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. + repeated float float_data = 4 [packed = true]; + + // For int32, uint8, int8, uint16, int16, bool, and float16 values + // float16 values must be bit-wise converted to an uint16_t prior + // to writing to the buffer. + // When this field is present, the data_type field MUST be + // INT32, INT16, INT8, UINT16, UINT8, BOOL, or FLOAT16 + repeated int32 int32_data = 5 [packed = true]; + + // For strings. + // Each element of string_data is a UTF-8 encoded Unicode + // string. No trailing null, no leading BOM. The protobuf "string" + // scalar type is not used to match ML community conventions. + // When this field is present, the data_type field MUST be STRING + repeated bytes string_data = 6; + + // For int64. + // When this field is present, the data_type field MUST be INT64 + repeated int64 int64_data = 7 [packed = true]; + + // Optionally, a name for the tensor. + optional string name = 8; // namespace Value + + // A human-readable documentation for this tensor. Markdown is allowed. + optional string doc_string = 12; + + // Serializations can either use one of the fields above, or use this + // raw bytes field. The only exception is the string case, where one is + // required to store the content in the repeated bytes string_data field. + // + // When this raw_data field is used to store tensor value, elements MUST + // be stored in as fixed-width, little-endian order. + // Floating-point data types MUST be stored in IEEE 754 format. + // Complex64 elements must be written as two consecutive FLOAT values, real component first. 
+ // Complex128 elements must be written as two consecutive DOUBLE values, real component first. + // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // + // Note: the advantage of specific field rather than the raw_data field is + // that in some cases (e.g. int data), protobuf does a better packing via + // variable length storage, and may lead to smaller binary footprint. + // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED + optional bytes raw_data = 9; + + // Data can be stored inside the protobuf file using type-specific fields or raw_data. + // Alternatively, raw bytes data can be stored in an external file, using the external_data field. + // external_data stores key-value pairs describing data location. Recognized keys are: + // - "location" (required) - POSIX filesystem path relative to the directory where the ONNX + // protobuf model was stored + // - "offset" (optional) - position of byte at which stored data begins. Integer stored as string. + // Offset values SHOULD be multiples 4096 (page size) to enable mmap support. + // - "length" (optional) - number of bytes containing data. Integer stored as string. + // - "checksum" (optional) - SHA1 digest of file specified in under 'location' key. + repeated StringStringEntryProto external_data = 13; + + // Location of the data for this tensor. MUST be one of: + // - DEFAULT - data stored inside the protobuf message. Data is stored in raw_data (if set) otherwise in type-specified field. + // - EXTERNAL - data stored in an external location as described by external_data field. + enum DataLocation { + DEFAULT = 0; + EXTERNAL = 1; + } + + // If value not set, data is stored in raw_data (if set) otherwise in type-specified field. + optional DataLocation data_location = 14; + + // For double + // Complex128 tensors are encoded as a single array of doubles, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 + repeated double double_data = 10 [packed = true]; + + // For uint64 and uint32 values + // When this field is present, the data_type field MUST be + // UINT32 or UINT64 + repeated uint64 uint64_data = 11 [packed = true]; +} + +// Defines a tensor shape. A dimension can be either an integer value +// or a symbolic variable. A symbolic variable represents an unknown +// dimension. +message TensorShapeProto { + message Dimension { + oneof value { + int64 dim_value = 1; + string dim_param = 2; // namespace Shape + }; + // Standard denotation can optionally be used to denote tensor + // dimensions with standard semantic descriptions to ensure + // that operations are applied to the correct axis of a tensor. + // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition + // for pre-defined dimension denotations. + optional string denotation = 3; + }; + repeated Dimension dim = 1; +} + +// Types +// +// The standard ONNX data types. +message TypeProto { + + message Tensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST have a valid TensorProto.DataType value + // This field MUST be present for this version of the IR. 
+ optional int32 elem_type = 1; + optional TensorShapeProto shape = 2; + } + + + oneof value { + // The type of a tensor. + Tensor tensor_type = 1; + + } + + // An optional denotation can be used to denote the whole + // type with a standard semantic description as to what is + // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition + // for pre-defined type denotations. + optional string denotation = 6; +} + +// Operator Sets +// +// OperatorSets are uniquely identified by a (domain, opset_version) pair. +message OperatorSetIdProto { + // The domain of the operator set being identified. + // The empty string ("") or absence of this field implies the operator + // set that is defined as part of the ONNX specification. + // This field MUST be present in this version of the IR when referring to any other operator set. + optional string domain = 1; + + // The version of the operator set being identified. + // This field MUST be present in this version of the IR. + optional int64 version = 2; +} diff --git a/tools/pnnx/src/pass_level0.cpp b/tools/pnnx/src/pass_level0.cpp index d50f71bbe296..ba7b7d5109fc 100644 --- a/tools/pnnx/src/pass_level0.cpp +++ b/tools/pnnx/src/pass_level0.cpp @@ -16,19 +16,22 @@ #include "pass_level0/constant_unpooling.h" #include "pass_level0/inline_block.h" +#include "pass_level0/reset_device.h" #include "pass_level0/shape_inference.h" namespace pnnx { -void pass_level0(const torch::jit::Module& mod, std::shared_ptr& g, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, std::map& foldable_constants) +void pass_level0(const torch::jit::Module& mod, std::shared_ptr& g, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, const std::string& device, std::set& foldable_constants, const std::string& foldable_constants_zippath) { inline_block(g, module_operators); + reset_device(g, device); + constant_unpooling(g); if (!input_tensors.empty()) { - shape_inference(mod, g, input_tensors, input_tensors2, module_operators, ptpath, foldable_constants); + shape_inference(mod, g, input_tensors, input_tensors2, module_operators, ptpath, device, foldable_constants, foldable_constants_zippath); } } diff --git a/tools/pnnx/src/pass_level0.h b/tools/pnnx/src/pass_level0.h index 11543ddc8ffa..783a8522d4bf 100644 --- a/tools/pnnx/src/pass_level0.h +++ b/tools/pnnx/src/pass_level0.h @@ -20,7 +20,7 @@ namespace pnnx { -void pass_level0(const torch::jit::Module& mod, std::shared_ptr& g, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, std::map& foldable_constants); +void pass_level0(const torch::jit::Module& mod, std::shared_ptr& g, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, const std::string& device, std::set& foldable_constants, const std::string& foldable_constants_zippath); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level0/reset_device.cpp b/tools/pnnx/src/pass_level0/reset_device.cpp new file mode 100644 index 000000000000..b817e41a1f4e --- /dev/null +++ b/tools/pnnx/src/pass_level0/reset_device.cpp @@ -0,0 +1,36 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "reset_device.h" +#include "../pass_level1.h" + +namespace pnnx { + +void reset_device(std::shared_ptr& graph, const std::string& device) +{ + for (torch::jit::Node* n : graph->nodes()) + { + if (n->kind().toDisplayString() == std::string("aten::to")) + { + if (n->hasNamedInput("device")) + { + torch::jit::Node* device_node = n->namedInput("device")->node(); + + device_node->s_(torch::jit::attr::value, (device == "gpu") ? "cuda" : "cpu"); + } + } + } +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level0/reset_device.h b/tools/pnnx/src/pass_level0/reset_device.h new file mode 100644 index 000000000000..17d8f93995e8 --- /dev/null +++ b/tools/pnnx/src/pass_level0/reset_device.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
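For context, the reset_device pass above rewrites the device argument of every aten::to node it finds, so a module traced with hard-coded .to('cuda') calls can still be exported on a CPU-only machine (and vice versa). A rough, hypothetical driver in the spirit of pnnx's main.cpp is sketched below; the model path and the "cpu" choice are placeholders, while torch::jit::load, get_method and graph are standard libtorch APIs.

```
// hypothetical standalone driver, assuming libtorch and the reset_device pass above;
// pnnx itself runs this rewrite from pass_level0 before shape inference
#include <torch/script.h>

#include "pass_level0/reset_device.h"

int main()
{
    // load on CPU even if the module was saved from a GPU session,
    // mirroring torch::jit::load(ptpath, c10::kCPU) in pnnx main.cpp
    torch::jit::Module mod = torch::jit::load("model.pt", c10::kCPU); // placeholder path

    std::shared_ptr<torch::jit::Graph> graph = mod.get_method("forward").graph();

    // rewrite every aten::to(..., device=...) constant to "cpu",
    // i.e. what the pnnx device=cpu command line option requests
    pnnx::reset_device(graph, "cpu");

    return 0;
}
```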
+ +#include + +namespace pnnx { + +void reset_device(std::shared_ptr& graph, const std::string& device); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level0/shape_inference.cpp b/tools/pnnx/src/pass_level0/shape_inference.cpp index a7274f370134..b3fcf4a944a6 100644 --- a/tools/pnnx/src/pass_level0/shape_inference.cpp +++ b/tools/pnnx/src/pass_level0/shape_inference.cpp @@ -15,8 +15,10 @@ #include "shape_inference.h" #include +#include "storezip.h" #include "pass_level0/constant_unpooling.h" #include "pass_level0/inline_block.h" +#include "pass_level0/reset_device.h" #include "pass_level0/shape_inference.h" namespace pnnx { @@ -27,7 +29,15 @@ static bool value_link_input(const torch::jit::Value* v, const std::vectornode()->kind().toDisplayString(); - if (optype == "aten::size" || optype == "aten::new_empty" || optype == "aten::new_ones" || optype == "aten::new_zeros") + if (optype == "aten::size" + || optype == "aten::new_empty" + || optype == "aten::new_full" + || optype == "aten::new_ones" + || optype == "aten::new_zeros" + || optype == "aten::empty_like" + || optype == "aten::full_like" + || optype == "aten::ones_like" + || optype == "aten::zeros_like") return false; } @@ -69,7 +79,7 @@ static bool value_link_output(const torch::jit::Value* v, const std::vector& graph, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, std::map& foldable_constants) +void shape_inference(const torch::jit::Module& mod, std::shared_ptr& graph, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, const std::string& device, std::set& foldable_constants, const std::string& foldable_constants_zippath) { // collect all intermediate output tensors std::vector > more_value_names; @@ -133,7 +143,8 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptr output_tensors; + StoreZipWriter zip; + zip.open(foldable_constants_zippath); for (size_t p = 0; p < more_value_names.size(); p++) { @@ -142,13 +153,15 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptr values2; @@ -163,7 +176,7 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptrdebugName()) != value_names.end()) { values2.push_back(v); - fprintf(stderr, "%s ", v->debugName().c_str()); + // fprintf(stderr, "%s ", v->debugName().c_str()); } } } @@ -193,7 +206,16 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptr 0) + { + // fprintf(stderr, "foldable_constant %s\n", v->debugName().c_str()); + foldable_constants.insert(v->debugName()); + + at::Tensor t2 = t.cpu().contiguous(); + zip.write_file(v->debugName(), (const char*)t2.data_ptr(), t2.nbytes()); + } } } } @@ -231,12 +253,23 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptr 0) + { + // fprintf(stderr, "foldable_constant %s\n", v->debugName().c_str()); + foldable_constants.insert(v->debugName()); + + at::Tensor t2 = t.cpu().contiguous(); + zip.write_file(v->debugName(), (const char*)t2.data_ptr(), t2.nbytes()); + } } } } } + zip.close(); + if (input_tensors2.empty()) { for (size_t i = 0; i < input_tensors.size(); i++) @@ -269,33 +302,6 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptrinputs()[1 + i]->setType(finaltype); } } - - for (auto xx : output_tensors) - { - auto v = xx.first; - auto tensor = xx.second; - - bool link_to_output = false; - for (size_t i = 0; i < v->uses().size(); i++) - { - auto node = v->uses()[i].user; - for 
(auto x : node->outputs()) - { - if (output_tensors.find(x) == output_tensors.end()) - { - link_to_output = true; - break; - } - } - } - - const int ndim = (int)tensor.dim(); - if (link_to_output && ndim > 0) - { - fprintf(stderr, "foldable_constant %s\n", v->debugName().c_str()); - foldable_constants[v->debugName()] = Attribute(tensor); - } - } } } // namespace pnnx diff --git a/tools/pnnx/src/pass_level0/shape_inference.h b/tools/pnnx/src/pass_level0/shape_inference.h index cf80ade7abef..feabfffe64b1 100644 --- a/tools/pnnx/src/pass_level0/shape_inference.h +++ b/tools/pnnx/src/pass_level0/shape_inference.h @@ -18,6 +18,6 @@ namespace pnnx { -void shape_inference(const torch::jit::Module& mod, std::shared_ptr& graph, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, std::map& foldable_constants); +void shape_inference(const torch::jit::Module& mod, std::shared_ptr& graph, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, const std::string& device, std::set& foldable_constants, const std::string& foldable_constants_zippath); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level1.cpp b/tools/pnnx/src/pass_level1.cpp index 9b5e4460905c..0aaf4d897a88 100644 --- a/tools/pnnx/src/pass_level1.cpp +++ b/tools/pnnx/src/pass_level1.cpp @@ -376,10 +376,6 @@ void pass_level1(const torch::jit::Module& mod, const std::shared_ptrkind().toDisplayString(), name); - // always treat inplace op type as non-inplace version - if (op->type.size() > 2 && op->type[op->type.size() - 2] != '_' && op->type[op->type.size() - 1] == '_') - op->type = op->type.substr(0, op->type.size() - 1); - for (int i = 0; i < (int)n->inputs().size(); i++) { const auto& in = n->input(i); diff --git a/tools/pnnx/src/pass_level1/nn_Fold.cpp b/tools/pnnx/src/pass_level1/nn_Fold.cpp new file mode 100644 index 000000000000..045c1f6f1baf --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_Fold.cpp @@ -0,0 +1,48 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
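The widened optype list in value_link_input() above encodes a simple rule: ops such as aten::zeros_like or aten::new_full consume their tensor argument only through its shape and dtype, so a value reached from a graph input solely through them can still be captured as a foldable constant. Restated as a stand-alone helper for clarity (hypothetical name, not part of the patch):

#include <string>

// returns true when the op's output values do not depend on the values of its
// tensor input, only on its metadata, so the "linked to graph input" property
// stops propagating at this node
static bool op_reads_only_metadata(const std::string& optype)
{
    return optype == "aten::size"
           || optype == "aten::new_empty"
           || optype == "aten::new_full"
           || optype == "aten::new_ones"
           || optype == "aten::new_zeros"
           || optype == "aten::empty_like"
           || optype == "aten::full_like"
           || optype == "aten::ones_like"
           || optype == "aten::zeros_like";
}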
+ +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class Fold : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.fold.Fold"; + } + + const char* type_str() const + { + return "nn.Fold"; + } + + void write(Operator* op, const std::shared_ptr& graph) const + { + const torch::jit::Node* col2im = find_node_by_kind(graph, "aten::col2im"); + + op->params["output_size"] = col2im->namedInput("output_size"); + op->params["kernel_size"] = col2im->namedInput("kernel_size"); + op->params["stride"] = col2im->namedInput("stride"); + op->params["padding"] = col2im->namedInput("padding"); + op->params["dilation"] = col2im->namedInput("dilation"); + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(Fold) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level1/nn_GLU.cpp b/tools/pnnx/src/pass_level1/nn_GLU.cpp new file mode 100644 index 000000000000..72af2f3f0a62 --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_GLU.cpp @@ -0,0 +1,45 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// 2022 Xiaomi Corp. (author: Fangjun Kuang) +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class GLU : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.activation.GLU"; + } + + const char* type_str() const + { + return "nn.GLU"; + } + + void write(Operator* op, const std::shared_ptr& graph) const + { + const torch::jit::Node* glu = find_node_by_kind(graph, "aten::glu"); + + op->params["dim"] = glu->namedInput("dim"); + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(GLU) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level1/nn_LSTM.cpp b/tools/pnnx/src/pass_level1/nn_LSTM.cpp index cf345dce2ae6..a2354dfadfa7 100644 --- a/tools/pnnx/src/pass_level1/nn_LSTM.cpp +++ b/tools/pnnx/src/pass_level1/nn_LSTM.cpp @@ -33,9 +33,9 @@ class LSTM : public FuseModulePass void write(Operator* op, const std::shared_ptr& graph, const torch::jit::Module& mod) const { - // mod.dump(true, true, true); - - // graph->dump(); + // mod.dump(true, true, true); + // + // graph->dump(); const torch::jit::Node* lstm = find_node_by_kind(graph, "aten::lstm"); @@ -49,12 +49,13 @@ class LSTM : public FuseModulePass op->params["pnnx_rnn_output_swapped"] = 1; } - // for (auto aa : lstm->schema().arguments()) - // { - // fprintf(stderr, "arg %s\n", aa.name().c_str()); - // } + // for (auto aa : lstm->schema().arguments()) + // { + // fprintf(stderr, "arg %s\n", aa.name().c_str()); + // } const auto& weight_ih_l0 = mod.attr("weight_ih_l0").toTensor(); + const auto& weight_hh_l0 = mod.attr("weight_hh_l0").toTensor(); op->params["input_size"] = weight_ih_l0.size(1); op->params["hidden_size"] = weight_ih_l0.size(0) / 4; @@ -62,17 +63,12 @@ class LSTM : public FuseModulePass op->params["bias"] = lstm->namedInput("has_biases"); op->params["batch_first"] = lstm->namedInput("batch_first"); op->params["bidirectional"] = lstm->namedInput("bidirectional"); - - int32_t proj_size = 0; - if (mod.hasattr("weight_hr_l0")) { - torch::Tensor w_hr = mod.attr("weight_hr_l0").toTensor(); - proj_size = w_hr.size(0); - } - op->params["proj_size"] = proj_size; + op->params["proj_size"] = weight_ih_l0.size(0) / 4 == weight_hh_l0.size(1) ? 
0 : weight_hh_l0.size(1); const int num_layers = op->params["num_layers"].i; const bool bias = op->params["bias"].b; const bool bidirectional = op->params["bidirectional"].b; + const int proj_size = op->params["proj_size"].i; for (int k = 0; k < num_layers; k++) { @@ -82,11 +78,6 @@ class LSTM : public FuseModulePass op->attrs[weight_ih_lk_key] = mod.attr(weight_ih_lk_key).toTensor(); op->attrs[weight_hh_lk_key] = mod.attr(weight_hh_lk_key).toTensor(); - if (proj_size) { - std::string weight_hr_lk_key = std::string("weight_hr_l") + std::to_string(k); - op->attrs[weight_hr_lk_key] = mod.attr(weight_hr_lk_key).toTensor(); - } - if (bias) { std::string bias_ih_lk_key = std::string("bias_ih_l") + std::to_string(k); @@ -96,6 +87,13 @@ class LSTM : public FuseModulePass op->attrs[bias_hh_lk_key] = mod.attr(bias_hh_lk_key).toTensor(); } + if (proj_size > 0) + { + std::string weight_hr_lk_key = std::string("weight_hr_l") + std::to_string(k); + + op->attrs[weight_hr_lk_key] = mod.attr(weight_hr_lk_key).toTensor(); + } + if (bidirectional) { std::string weight_ih_lk_reverse_key = std::string("weight_ih_l") + std::to_string(k) + "_reverse"; @@ -112,6 +110,13 @@ class LSTM : public FuseModulePass op->attrs[bias_ih_lk_reverse_key] = mod.attr(bias_ih_lk_reverse_key).toTensor(); op->attrs[bias_hh_lk_reverse_key] = mod.attr(bias_hh_lk_reverse_key).toTensor(); } + + if (proj_size > 0) + { + std::string weight_hr_lk_reverse_key = std::string("weight_hr_l") + std::to_string(k) + "_reverse"; + + op->attrs[weight_hr_lk_reverse_key] = mod.attr(weight_hr_lk_reverse_key).toTensor(); + } } } } diff --git a/tools/pnnx/src/pass_level1/nn_MultiheadAttention.cpp b/tools/pnnx/src/pass_level1/nn_MultiheadAttention.cpp index 5a54ac442db5..608bca6fdbcf 100644 --- a/tools/pnnx/src/pass_level1/nn_MultiheadAttention.cpp +++ b/tools/pnnx/src/pass_level1/nn_MultiheadAttention.cpp @@ -39,45 +39,75 @@ class MultiheadAttention : public FuseModulePass // graph->dump(); - const torch::jit::Node* div_num_heads = find_node_by_kind(graph, "aten::div"); - const torch::jit::Node* div_num_heads_18 = find_node_by_kind(graph, "aten::floor_divide"); - if (div_num_heads_18) + const torch::jit::Node* multi_head_attention = find_node_by_kind(graph, "aten::_native_multi_head_attention"); + if (multi_head_attention) { - div_num_heads = div_num_heads_18; + op->params["num_heads"] = multi_head_attention->namedInput("num_head"); + op->params["batch_first"] = true; + op->params["add_zero_attn"] = false; } + else + { + const torch::jit::Node* div_num_heads = find_node_by_kind(graph, "aten::div"); + const torch::jit::Node* div_num_heads_18 = find_node_by_kind(graph, "aten::floor_divide"); + if (div_num_heads_18) + { + div_num_heads = div_num_heads_18; + } - op->params["num_heads"] = (int)div_num_heads->input(1)->node()->t(torch::jit::attr::value).item(); + op->params["num_heads"] = (int)div_num_heads->input(1)->node()->t(torch::jit::attr::value).item(); - const torch::jit::Node* transpose_batch_seq = find_node_by_kind(graph, "aten::transpose"); + const torch::jit::Node* transpose_batch_seq = find_node_by_kind(graph, "aten::transpose"); - int transpose_dim0 = transpose_batch_seq->input(1)->node()->i(torch::jit::attr::value); - int transpose_dim1 = transpose_batch_seq->input(2)->node()->i(torch::jit::attr::value); - if (transpose_dim0 == 1 && transpose_dim1 == 0) - { - op->params["batch_first"] = true; - } + int transpose_dim0 = transpose_batch_seq->input(1)->node()->i(torch::jit::attr::value); + int transpose_dim1 = 
transpose_batch_seq->input(2)->node()->i(torch::jit::attr::value); + if (transpose_dim0 == 1 && transpose_dim1 == 0) + { + op->params["batch_first"] = true; + } #if TORCH_VERSION_MAJOR == 1 && TORCH_VERSION_MINOR >= 9 - else - { - op->params["batch_first"] = false; - } + else + { + op->params["batch_first"] = false; + } #endif - const torch::jit::Node* add_zero_attn = find_node_by_kind(graph, "aten::zeros"); - if (add_zero_attn) + const torch::jit::Node* add_zero_attn = find_node_by_kind(graph, "aten::zeros"); + if (add_zero_attn) + { + op->params["add_zero_attn"] = true; + } + else + { + op->params["add_zero_attn"] = false; + } + } + + if (mod.hasattr("in_proj_weight")) { - op->params["add_zero_attn"] = true; + const auto& in_proj_weight = mod.attr("in_proj_weight").toTensor(); + + op->params["embed_dim"] = in_proj_weight.size(1); + op->params["kdim"] = in_proj_weight.size(1); + op->params["vdim"] = in_proj_weight.size(1); + op->attrs["in_proj_weight"] = in_proj_weight; } else { - op->params["add_zero_attn"] = false; + const auto& q_proj_weight = mod.attr("q_proj_weight").toTensor(); + const auto& k_proj_weight = mod.attr("k_proj_weight").toTensor(); + const auto& v_proj_weight = mod.attr("v_proj_weight").toTensor(); + + op->params["embed_dim"] = q_proj_weight.size(1); + op->params["kdim"] = k_proj_weight.size(1); + op->params["vdim"] = v_proj_weight.size(1); + op->attrs["q_proj_weight"] = q_proj_weight; + op->attrs["k_proj_weight"] = k_proj_weight; + op->attrs["v_proj_weight"] = v_proj_weight; } - const auto& in_proj_weight = mod.attr("in_proj_weight").toTensor(); const auto& out_proj_weight = mod.attr("out_proj").toModule().attr("weight").toTensor(); - op->params["embed_dim"] = in_proj_weight.size(1); - op->attrs["in_proj_weight"] = in_proj_weight; op->attrs["out_proj.weight"] = out_proj_weight; if (mod.hasattr("in_proj_bias") && mod.attr("out_proj").toModule().hasattr("bias")) diff --git a/tools/pnnx/src/pass_level1/nn_Softmax2d.cpp b/tools/pnnx/src/pass_level1/nn_Softmax2d.cpp new file mode 100644 index 000000000000..c80404066233 --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_Softmax2d.cpp @@ -0,0 +1,37 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
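The nn.MultiheadAttention rework above handles two cases the previous code missed: graphs where newer PyTorch collapses the module into a single aten::_native_multi_head_attention node (num_heads is then read straight from its num_head input and batch_first is set to true), and modules built with kdim or vdim different from embed_dim, which carry separate q_proj_weight / k_proj_weight / v_proj_weight tensors instead of one packed in_proj_weight. Shape summary with illustrative sizes (embed_dim=256, kdim=64, vdim=32):

// packed case:    in_proj_weight : (3*256, 256)  ->  embed_dim = kdim = vdim = 256
// separate case:  q_proj_weight  : (256, 256)    ->  embed_dim = 256
//                 k_proj_weight  : (256, 64)     ->  kdim = 64
//                 v_proj_weight  : (256, 32)     ->  vdim = 32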
+ +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class Softmax2d : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.activation.Softmax2d"; + } + + const char* type_str() const + { + return "nn.Softmax2d"; + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(Softmax2d) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level1/nn_Unfold.cpp b/tools/pnnx/src/pass_level1/nn_Unfold.cpp new file mode 100644 index 000000000000..1abf6201a832 --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_Unfold.cpp @@ -0,0 +1,47 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class Unfold : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.fold.Unfold"; + } + + const char* type_str() const + { + return "nn.Unfold"; + } + + void write(Operator* op, const std::shared_ptr& graph) const + { + const torch::jit::Node* im2col = find_node_by_kind(graph, "aten::im2col"); + + op->params["kernel_size"] = im2col->namedInput("kernel_size"); + op->params["stride"] = im2col->namedInput("stride"); + op->params["padding"] = im2col->namedInput("padding"); + op->params["dilation"] = im2col->namedInput("dilation"); + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(Unfold) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2.cpp b/tools/pnnx/src/pass_level2.cpp index b461d1e5c0f0..e9a98d4b267f 100644 --- a/tools/pnnx/src/pass_level2.cpp +++ b/tools/pnnx/src/pass_level2.cpp @@ -39,6 +39,11 @@ bool GraphRewriterPass::match(const std::map& captured_p return match(captured_params); } +bool GraphRewriterPass::match(const std::map& /*matched_operators*/) const +{ + return true; +} + void GraphRewriterPass::write(Operator* op, const std::map& captured_params) const { for (auto x : captured_params) @@ -215,7 +220,7 @@ static bool match_operator(const Operator* a, const Operator* b, std::map& matched_operators, std::unordered_map& matched_inputs, std::map& captured_params, std::map& captured_attrs) +static bool match(const Operator* anchor, const Operator* pattern, std::map& matched_operators, std::map& matched_inputs, std::map& captured_params, std::map& captured_attrs) { if (!match_operator(anchor, pattern, captured_params, captured_attrs)) return false; @@ -290,9 +295,9 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& opinde bool matched = true; // lets match from output - std::unordered_map matched_operators; - std::unordered_map matched_inputs; - std::unordered_map matched_outputs; + std::map matched_operators; + std::map matched_inputs; + std::map matched_outputs; std::map captured_params; std::map captured_attrs; @@ -311,8 +316,8 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& 
opinde { const Operator* anchor = graph.ops[j]; - std::unordered_map matched_operators2; - std::unordered_map matched_inputs2; + std::map matched_operators2; + std::map matched_inputs2; std::map captured_params2; std::map captured_attrs2; if (!match(anchor, pattern2, matched_operators2, matched_inputs2, captured_params2, captured_attrs2)) @@ -372,7 +377,7 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& opinde break; } - if (matched && !pass->match(captured_params, captured_attrs)) + if (matched && (!pass->match(captured_params, captured_attrs) || !pass->match(matched_operators))) { matched_operators.clear(); matched_inputs.clear(); @@ -393,7 +398,7 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& opinde // lets replace // remove all operands inside matched graph - std::unordered_map operands_to_remove; + std::map operands_to_remove; for (auto& _x : matched_operators) { Operator* x = (Operator*)_x.second; @@ -502,8 +507,112 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& opinde } } +static void fix_inplace_copy_output(Graph& graph) +{ + while (1) + { + bool matched = false; + for (size_t i = 0; i < graph.ops.size(); i++) + { + Operator* op = graph.ops[i]; + + bool is_inplace_op = op->type.size() > 2 && op->type[op->type.size() - 2] != '_' && op->type[op->type.size() - 1] == '_'; + if (!is_inplace_op) + continue; + + // replace inplace op with non-inplace version + op->type = op->type.substr(0, op->type.size() - 1); + + if (op->type == "aten::copy") + continue; + + if (op->outputs[0]->consumers.size() != 0) + continue; + + matched = true; + + // find in0 from slice / select chain + Operand* in0 = op->inputs[0]; + while (in0->producer->type == "aten::slice" || in0->producer->type == "aten::select") + { + in0 = in0->producer->inputs[0]; + } + + // append copy for inplace op + Operator* op_copy = graph.new_operator_after("aten::copy", op->name + "_copy", op); + Operand* copy_out = graph.new_operand(op->name + "_copy_out"); + + copy_out->shape = in0->shape; + + op_copy->inputs.push_back(op->inputs[0]); + op_copy->inputs.push_back(op->outputs[0]); + op->inputs[0]->consumers.push_back(op_copy); + op->outputs[0]->consumers.push_back(op_copy); + + op_copy->outputs.push_back(copy_out); + copy_out->producer = op_copy; + + break; + } + + if (!matched) + break; + } + + for (size_t i = 0; i < graph.ops.size(); i++) + { + Operator* op = graph.ops[i]; + + if (op->type != "aten::copy") + continue; + + if (op->outputs[0]->consumers.size() != 0) + continue; + + // aten::slice 5 1 in0 .... a + // aten::slice 5 1 a .... b + // aten::copy 2 1 b in1 out + + // aten::select 3 1 in0 .... 
a + // aten::copy 2 1 a in1 out + + // find in0 from slice / select chain + Operand* in0 = op->inputs[0]; + while (in0->producer->type == "aten::slice" || in0->producer->type == "aten::select") + { + in0 = in0->producer->inputs[0]; + } + + // replace all the following uses of in0 with out + Operand* out0 = op->outputs[0]; + out0->shape = in0->shape; + for (size_t j = i; j < graph.ops.size(); j++) + { + Operator* op2 = graph.ops[j]; + + bool use_in0 = false; + for (size_t k = 0; k < op2->inputs.size(); k++) + { + if (op2->inputs[k] == in0) + { + op2->inputs[k] = out0; + use_in0 = true; + } + } + + if (use_in0) + { + in0->remove_consumer(op2); + out0->consumers.push_back(op2); + } + } + } +} + void pass_level2(Graph& g) { + fix_inplace_copy_output(g); + int opindex = 0; for (auto x : g_global_pnnx_graph_rewriter_passes) { diff --git a/tools/pnnx/src/pass_level2.h b/tools/pnnx/src/pass_level2.h index 1a0562be939d..af0fb8346df3 100644 --- a/tools/pnnx/src/pass_level2.h +++ b/tools/pnnx/src/pass_level2.h @@ -34,6 +34,8 @@ class GraphRewriterPass virtual bool match(const std::map& captured_params, const std::map& captured_attrs) const; + virtual bool match(const std::map& matched_operators) const; + virtual void write(Operator* op, const std::map& captured_params) const; virtual void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const; diff --git a/tools/pnnx/src/pass_level2/F_fold.cpp b/tools/pnnx/src/pass_level2/F_fold.cpp new file mode 100644 index 000000000000..39e3787fbc5d --- /dev/null +++ b/tools/pnnx/src/pass_level2/F_fold.cpp @@ -0,0 +1,45 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
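fix_inplace_copy_output() above is what replaces the blanket inplace-to-non-inplace renaming removed from pass_level1: every trailing-underscore op is still renamed, but an inplace op whose result is unused additionally gets an explicit aten::copy appended, and for each dangling aten::copy the tensor at the root of its slice/select chain has all later uses redirected to the copy output. A rough illustration (not taken from a real trace) for sliced in-place assignment such as x[:, 0:1] = y:

// before:  %s = aten::slice(%x, dim=1, start=0, end=1, step=1)
//          aten::copy_(%s, %y)        // result unused; relies on aliasing into %x
// after:   %c = aten::copy(%s, %y)    // renamed to the non-inplace form
//          every operator after the copy that consumed %x now consumes %c, which
//          the Tensor.copy rewriters below and fuse_slice_copy at level 5 turn
//          into an explicit slice-assignment on a full tensor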
+ +#include "pass_level2.h" + +namespace pnnx { + +class F_fold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 output_size +pnnx.Input input_2 0 1 kernel_size +pnnx.Input input_3 0 1 dilation +pnnx.Input input_4 0 1 padding +pnnx.Input input_5 0 1 stride +aten::col2im op_0 6 1 input output_size kernel_size dilation padding stride out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.fold"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_fold, 10) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_glu.cpp b/tools/pnnx/src/pass_level2/F_glu.cpp index 5ca26b96ad21..f6ed24408fdc 100644 --- a/tools/pnnx/src/pass_level2/F_glu.cpp +++ b/tools/pnnx/src/pass_level2/F_glu.cpp @@ -16,21 +16,26 @@ namespace pnnx { -class F_glu : public GraphRewriterPass { - public: - const char *match_pattern_graph() const { - return R"PNNXIR(7767517 +class F_glu : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 4 3 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 dim aten::glu op_1 2 1 input dim out pnnx.Output output 1 0 out )PNNXIR"; - } + } - const char *type_str() const { return "F.glu"; } + const char* type_str() const + { + return "F.glu"; + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_glu, 10) -} // namespace pnnx +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softmax.cpp b/tools/pnnx/src/pass_level2/F_softmax.cpp index af34a58957c8..8a9352beba20 100644 --- a/tools/pnnx/src/pass_level2/F_softmax.cpp +++ b/tools/pnnx/src/pass_level2/F_softmax.cpp @@ -25,7 +25,7 @@ class F_softmax : public GraphRewriterPass 5 4 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 dim -prim::Constant op_0 0 1 dtype value=None +prim::Constant op_0 0 1 dtype value=* aten::softmax op_1 3 1 input dim dtype out pnnx.Output output 1 0 out )PNNXIR"; diff --git a/tools/pnnx/src/pass_level2/F_unfold.cpp b/tools/pnnx/src/pass_level2/F_unfold.cpp new file mode 100644 index 000000000000..a6a236080d41 --- /dev/null +++ b/tools/pnnx/src/pass_level2/F_unfold.cpp @@ -0,0 +1,44 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_level2.h" + +namespace pnnx { + +class F_unfold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 kernel_size +pnnx.Input input_2 0 1 dilation +pnnx.Input input_3 0 1 padding +pnnx.Input input_4 0 1 stride +aten::im2col op_0 5 1 input kernel_size dilation padding stride out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.unfold"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_unfold, 10) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_upsample_nearest.cpp b/tools/pnnx/src/pass_level2/F_upsample_nearest.cpp index c544e8065bbd..72b78b414410 100644 --- a/tools/pnnx/src/pass_level2/F_upsample_nearest.cpp +++ b/tools/pnnx/src/pass_level2/F_upsample_nearest.cpp @@ -63,6 +63,29 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_upsample_nearest_1, 10) +class F_upsample_nearest_1_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 size +prim::Constant op_0 0 1 scale_factor value=None +aten::upsample_nearest2d op_1 3 1 input size scale_factor out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.upsample_nearest"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_upsample_nearest_1_1, 10) + class F_upsample_nearest_2 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_level2/Tensor_copy.cpp b/tools/pnnx/src/pass_level2/Tensor_copy.cpp new file mode 100644 index 000000000000..d5369b29e8ac --- /dev/null +++ b/tools/pnnx/src/pass_level2/Tensor_copy.cpp @@ -0,0 +1,64 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_level2.h" + +namespace pnnx { + +class Tensor_copy : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 self +pnnx.Input input_1 0 1 src +prim::Constant op_0 0 1 non_blocking value=* +aten::copy op_1 3 1 self src non_blocking out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Tensor.copy"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_copy, 20) + +class Tensor_copy_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 self +pnnx.Input input_1 0 1 src +aten::copy op_1 2 1 self src out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Tensor.copy"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_copy_1, 20) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_bitwise_left_shift.cpp b/tools/pnnx/src/pass_level2/torch_bitwise_left_shift.cpp new file mode 100644 index 000000000000..4fadaad74af5 --- /dev/null +++ b/tools/pnnx/src/pass_level2/torch_bitwise_left_shift.cpp @@ -0,0 +1,41 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_level2.h" + +namespace pnnx { + +class torch_bitwise_left_shift : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 other +aten::bitwise_left_shift op_0 2 1 input other out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.bitwise_left_shift"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_bitwise_left_shift, 20) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_bitwise_right_shift.cpp b/tools/pnnx/src/pass_level2/torch_bitwise_right_shift.cpp new file mode 100644 index 000000000000..4db2560da3fe --- /dev/null +++ b/tools/pnnx/src/pass_level2/torch_bitwise_right_shift.cpp @@ -0,0 +1,41 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_level2.h" + +namespace pnnx { + +class torch_bitwise_right_shift : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 other +aten::bitwise_right_shift op_0 2 1 input other out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.bitwise_right_shift"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_bitwise_right_shift, 20) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_einsum.cpp b/tools/pnnx/src/pass_level2/torch_einsum.cpp index 771df403c9e7..f6b24757e50c 100644 --- a/tools/pnnx/src/pass_level2/torch_einsum.cpp +++ b/tools/pnnx/src/pass_level2/torch_einsum.cpp @@ -38,4 +38,27 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_einsum, 20) +class torch_einsum_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 equation +pnnx.Input input_1 0 1 operands +prim::Constant op_0 0 1 path value=None +aten::einsum op_1 3 1 equation operands path out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.einsum"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_einsum_1, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level3.cpp b/tools/pnnx/src/pass_level3.cpp index dc272845319b..182e5a343fe4 100644 --- a/tools/pnnx/src/pass_level3.cpp +++ b/tools/pnnx/src/pass_level3.cpp @@ -37,7 +37,7 @@ namespace pnnx { -void pass_level3(Graph& g, const std::map& foldable_constants) +void pass_level3(Graph& g, const std::set& foldable_constants) { assign_unique_name(g); diff --git a/tools/pnnx/src/pass_level3.h b/tools/pnnx/src/pass_level3.h index ac6a0b265fe5..53482824fff0 100644 --- a/tools/pnnx/src/pass_level3.h +++ b/tools/pnnx/src/pass_level3.h @@ -19,7 +19,7 @@ namespace pnnx { -void pass_level3(Graph& g, const std::map& foldable_constants); +void pass_level3(Graph& g, const std::set& foldable_constants); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level3/fuse_expression.cpp b/tools/pnnx/src/pass_level3/fuse_expression.cpp index 645a0a5d7f03..0866e1301c6c 100644 --- a/tools/pnnx/src/pass_level3/fuse_expression.cpp +++ b/tools/pnnx/src/pass_level3/fuse_expression.cpp @@ -65,30 +65,57 @@ static bool operand_maybe_tensor(const Operand* operand) return false; } - if (op->type == "aten::floor_divide" || op->type == "aten::mul" || op->type == "aten::div" || op->type == "aten::pow") + if (op->type == "aten::abs" + || op->type == "aten::acos" + || op->type == "aten::acosh" + || op->type == "aten::asin" + || op->type == "aten::asinh" + || op->type == "aten::atan" + || op->type == "aten::atanh" + || op->type == "aten::ceil" + || op->type == "aten::cos" + || op->type == "aten::cosh" + || op->type == "aten::exp" + || op->type == "aten::floor" + || op->type == "aten::log" + || op->type == "aten::neg" + || op->type == "aten::reciprocal" + || op->type == "aten::rsqrt" + || op->type == "aten::sign" + || op->type == "aten::sin" + || op->type == "aten::sinh" + || op->type == "aten::sqrt" + || op->type == "aten::square" + || op->type == "aten::tan" + || op->type == "aten::tanh" + || op->type == "aten::trunc") { - return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]); + return operand_maybe_tensor(op->inputs[0]); } - if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__") + 
if (op->type == "aten::atan2" + || op->type == "aten::div" + || op->type == "aten::floor_divide" + || op->type == "aten::mul" + || op->type == "aten::pow") { return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]); } - if (op->type == "aten::add" || op->type == "aten::sub" || op->type == "aten::rsub") + if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__" || op->type == "aten::__lshift__" || op->type == "aten::__rshift__") { - return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]) || operand_maybe_tensor(op->inputs[2]); + return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]); } - if (op->type == "aten::sqrt" || op->type == "aten::rsqrt" || op->type == "aten::neg" || op->type == "aten::floor" || op->type == "aten::exp") + if (op->type == "aten::add" || op->type == "aten::sub" || op->type == "aten::rsub") { - return operand_maybe_tensor(op->inputs[0]); + return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]) || operand_maybe_tensor(op->inputs[2]); } return true; } -static bool operand_is_foldable(const Operand* operand, const std::map& foldable_constants) +static bool operand_is_foldable(const Operand* operand, const std::set& foldable_constants) { if (foldable_constants.find(operand->name) != foldable_constants.end()) return true; @@ -107,7 +134,7 @@ static bool operand_is_foldable(const Operand* operand, const std::map& inputs, const std::map& foldable_constants, bool checksubgraph = true) +static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, std::vector& inputs, const std::set& foldable_constants, bool checksubgraph = true) { // fprintf(stderr, "fuse_expression %s\n", operand->name.c_str()); @@ -246,7 +273,44 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s { fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); } - else if (op->type == "aten::floor_divide" || op->type == "aten::mul" || op->type == "aten::div" || op->type == "aten::pow" || op->type == "aten::remainder") + else if (op->type == "aten::abs" + || op->type == "aten::acos" + || op->type == "aten::acosh" + || op->type == "aten::asin" + || op->type == "aten::asinh" + || op->type == "aten::atan" + || op->type == "aten::atanh" + || op->type == "aten::ceil" + || op->type == "aten::cos" + || op->type == "aten::cosh" + || op->type == "aten::exp" + || op->type == "aten::floor" + || op->type == "aten::log" + || op->type == "aten::neg" + || op->type == "aten::reciprocal" + || op->type == "aten::rsqrt" + || op->type == "aten::sign" + || op->type == "aten::sin" + || op->type == "aten::sinh" + || op->type == "aten::sqrt" + || op->type == "aten::square" + || op->type == "aten::tan" + || op->type == "aten::tanh" + || op->type == "aten::trunc") + { + std::string mathop = op->type.substr(6); + + expr += mathop; + expr += "("; + fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); + expr += ")"; + } + else if (op->type == "aten::atan2" + || op->type == "aten::div" + || op->type == "aten::floor_divide" + || op->type == "aten::mul" + || op->type == "aten::pow" + || op->type == "aten::remainder") { std::string mathop = op->type.substr(6); @@ -257,11 +321,9 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s fuse_expression(graph, op->inputs[1], expr, inputs, foldable_constants); expr += ")"; } - else if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type 
== "aten::__xor__") + else if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__" || op->type == "aten::__lshift__" || op->type == "aten::__rshift__") { - std::string mathop = op->type.substr(8, 3); - if (mathop == "or_") - mathop = "or"; + std::string mathop = op->type.substr(8, op->type.size() - 10); expr += mathop; expr += "("; @@ -326,36 +388,6 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); expr += ")"; } - else if (op->type == "aten::sqrt") - { - expr += "sqrt("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } - else if (op->type == "aten::rsqrt") - { - expr += "rsqrt("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } - else if (op->type == "aten::neg") - { - expr += "neg("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } - else if (op->type == "aten::floor") - { - expr += "floor("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } - else if (op->type == "aten::exp") - { - expr += "exp("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } else { auto it = std::find(inputs.begin(), inputs.end(), operand); @@ -378,7 +410,7 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s } } -void fuse_expression(Graph& graph, const std::map& foldable_constants) +void fuse_expression(Graph& graph, const std::set& foldable_constants) { int pnnx_expr_index = 0; @@ -415,11 +447,43 @@ void fuse_expression(Graph& graph, const std::map& folda { need_fuse = true; } - if (op->type == "aten::floor_divide" || op->type == "aten::add" || op->type == "aten::sub" || op->type == "aten::mul" || op->type == "aten::div" || op->type == "aten::sqrt" || op->type == "aten::rsub" || op->type == "aten::rsqrt" || op->type == "aten::neg" || op->type == "aten::pow" || op->type == "aten::remainder" || op->type == "aten::floor" || op->type == "aten::exp") + if (op->type == "aten::abs" + || op->type == "aten::acos" + || op->type == "aten::acosh" + || op->type == "aten::add" + || op->type == "aten::asin" + || op->type == "aten::asinh" + || op->type == "aten::atan" + || op->type == "aten::atanh" + || op->type == "aten::atan2" + || op->type == "aten::ceil" + || op->type == "aten::cos" + || op->type == "aten::cosh" + || op->type == "aten::div" + || op->type == "aten::exp" + || op->type == "aten::floor" + || op->type == "aten::floor_divide" + || op->type == "aten::log" + || op->type == "aten::mul" + || op->type == "aten::neg" + || op->type == "aten::pow" + || op->type == "aten::reciprocal" + || op->type == "aten::remainder" + || op->type == "aten::rsqrt" + || op->type == "aten::rsub" + || op->type == "aten::sign" + || op->type == "aten::sin" + || op->type == "aten::sinh" + || op->type == "aten::sqrt" + || op->type == "aten::square" + || op->type == "aten::sub" + || op->type == "aten::tan" + || op->type == "aten::tanh" + || op->type == "aten::trunc") { need_fuse = true; } - if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__") + if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__" || op->type == "aten::__lshift__" || op->type == "aten::__rshift__") { need_fuse = true; } diff --git a/tools/pnnx/src/pass_level3/fuse_expression.h 
b/tools/pnnx/src/pass_level3/fuse_expression.h index 77ae711c8afc..5c8297a2158b 100644 --- a/tools/pnnx/src/pass_level3/fuse_expression.h +++ b/tools/pnnx/src/pass_level3/fuse_expression.h @@ -16,6 +16,6 @@ namespace pnnx { -void fuse_expression(Graph& graph, const std::map& foldable_constants); +void fuse_expression(Graph& graph, const std::set& foldable_constants); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level3/fuse_input_unpack.cpp b/tools/pnnx/src/pass_level3/fuse_input_unpack.cpp index 7eff87273d5b..85fcbce1486e 100644 --- a/tools/pnnx/src/pass_level3/fuse_input_unpack.cpp +++ b/tools/pnnx/src/pass_level3/fuse_input_unpack.cpp @@ -26,7 +26,7 @@ void fuse_input_unpack(Graph& graph) { Operator* op = graph.ops[i]; - if (op->type != "prim::TupleUnpack") + if (op->type != "prim::TupleUnpack" && op->type != "prim::ListUnpack") continue; if (op->inputs.size() != 1) diff --git a/tools/pnnx/src/pass_level5.cpp b/tools/pnnx/src/pass_level5.cpp index 1de1d0079479..ae365f369df8 100644 --- a/tools/pnnx/src/pass_level5.cpp +++ b/tools/pnnx/src/pass_level5.cpp @@ -22,9 +22,10 @@ #include "pass_level5/eliminate_noop_expression.h" #include "pass_level5/eliminate_noop_pad.h" #include "pass_level5/eliminate_noop_upsample.h" -#include "pass_level5/eliminate_slice.h" -#include "pass_level5/eliminate_view_reshape.h" +#include "pass_level5/eliminate_noop_slice.h" +#include "pass_level5/eliminate_noop_view_reshape.h" #include "pass_level5/eval_expression.h" +#include "pass_level5/fuse_adjacent_reshape.h" #include "pass_level5/fuse_channel_shuffle.h" #include "pass_level5/fuse_constant_expression.h" #include "pass_level5/fuse_conv1d_batchnorm1d.h" @@ -33,10 +34,19 @@ #include "pass_level5/fuse_convtranspose2d_batchnorm2d.h" #include "pass_level5/fuse_contiguous_view.h" #include "pass_level5/fuse_linear_batchnorm1d.h" +#include "pass_level5/fuse_pad_conv1d.h" +#include "pass_level5/fuse_pad_conv2d.h" #include "pass_level5/fuse_select_to_unbind.h" +#include "pass_level5/fuse_slice_copy.h" #include "pass_level5/fuse_slice_indices.h" #include "pass_level5/fuse_slice_to_tensor_split.h" +#include "pass_level5/fuse_static_batchnorm.h" #include "pass_level5/fuse_static_conv.h" +#include "pass_level5/fuse_static_convtranspose.h" +#include "pass_level5/fuse_static_groupnorm.h" +#include "pass_level5/fuse_static_instancenorm.h" +#include "pass_level5/fuse_static_layernorm.h" +#include "pass_level5/fuse_static_linear.h" #include "pass_level5/normalize_einsum_equation.h" #include "pass_level4/dead_code_elimination.h" #include "pass_level4/canonicalize.h" @@ -44,15 +54,17 @@ namespace pnnx { -void pass_level5(Graph& g, const std::map& foldable_constants) +void pass_level5(Graph& g, const std::set& foldable_constants, const std::string& foldable_constants_zippath) { eval_expression(g); fuse_constant_expression(g); + fold_constants(g, foldable_constants, foldable_constants_zippath); + eliminate_noop_expression(g); - eliminate_slice(g); + eliminate_noop_slice(g); fuse_slice_indices(g); @@ -66,18 +78,26 @@ void pass_level5(Graph& g, const std::map& foldable_cons fuse_slice_to_tensor_split(g); + fuse_slice_copy(g); + + fuse_static_batchnorm(g); + fuse_static_groupnorm(g); + fuse_static_instancenorm(g); + fuse_static_layernorm(g); + fuse_static_conv(g); + fuse_static_convtranspose(g); + fuse_static_linear(g); fuse_conv1d_batchnorm1d(g); - fuse_conv2d_batchnorm2d(g); - fuse_convtranspose1d_batchnorm1d(g); - fuse_convtranspose2d_batchnorm2d(g); - fuse_linear_batchnorm1d(g); + fuse_pad_conv1d(g); + fuse_pad_conv2d(g); 
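Two related changes meet in this hunk: fuse_expression (above) now folds the full elementwise math vocabulary, unary functions such as abs, ceil, sin, tanh and trunc as well as atan2 and the shift operators, into a single pnnx.Expression (producing strings roughly of the form sqrt(div(@0,64)) instead of a chain of aten ops), and pass_level5 is resequenced so that fold_constants runs right after the expression passes, which makes the folded weights visible to the new fuse_static_* passes and to the fuse_pad_conv1d / fuse_pad_conv2d calls just above.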
+ eliminate_noop_pad(g); eliminate_noop_cat(g); @@ -88,11 +108,11 @@ void pass_level5(Graph& g, const std::map& foldable_cons fuse_contiguous_view(g); - eliminate_view_reshape(g); + fuse_adjacent_reshape(g); - fuse_channel_shuffle(g); + eliminate_noop_view_reshape(g); - fold_constants(g, foldable_constants); + fuse_channel_shuffle(g); fuse_index_expression(g); diff --git a/tools/pnnx/src/pass_level5.h b/tools/pnnx/src/pass_level5.h index fbf4ff486890..a040c7bf1457 100644 --- a/tools/pnnx/src/pass_level5.h +++ b/tools/pnnx/src/pass_level5.h @@ -19,7 +19,7 @@ namespace pnnx { -void pass_level5(Graph& g, const std::map& foldable_constants); +void pass_level5(Graph& g, const std::set& foldable_constants, const std::string& foldable_constants_zippath); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/eliminate_slice.cpp b/tools/pnnx/src/pass_level5/eliminate_noop_slice.cpp similarity index 94% rename from tools/pnnx/src/pass_level5/eliminate_slice.cpp rename to tools/pnnx/src/pass_level5/eliminate_noop_slice.cpp index 62939be2c422..5e31b7728972 100644 --- a/tools/pnnx/src/pass_level5/eliminate_slice.cpp +++ b/tools/pnnx/src/pass_level5/eliminate_noop_slice.cpp @@ -12,14 +12,15 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "eliminate_slice.h" +#include "eliminate_noop_slice.h" +#include #include #include "pass_level2.h" namespace pnnx { -void eliminate_slice(Graph& graph) +void eliminate_noop_slice(Graph& graph) { while (1) { @@ -44,7 +45,7 @@ void eliminate_slice(Graph& graph) int end = op->params.at("end").i; int step = op->params.at("step").i; - if (start == 0 && end == -1 && step == 1) + if (start == 0 && end == INT_MAX && step == 1) { // delete noop-like slice matched = true; diff --git a/tools/pnnx/src/pass_level5/eliminate_slice.h b/tools/pnnx/src/pass_level5/eliminate_noop_slice.h similarity index 94% rename from tools/pnnx/src/pass_level5/eliminate_slice.h rename to tools/pnnx/src/pass_level5/eliminate_noop_slice.h index a90ed96f4e98..162109d2a664 100644 --- a/tools/pnnx/src/pass_level5/eliminate_slice.h +++ b/tools/pnnx/src/pass_level5/eliminate_noop_slice.h @@ -16,6 +16,6 @@ namespace pnnx { -void eliminate_slice(Graph& graph); +void eliminate_noop_slice(Graph& graph); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/eliminate_view_reshape.cpp b/tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.cpp similarity index 96% rename from tools/pnnx/src/pass_level5/eliminate_view_reshape.cpp rename to tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.cpp index c3097bdb4439..e6b00e87b2a4 100644 --- a/tools/pnnx/src/pass_level5/eliminate_view_reshape.cpp +++ b/tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.cpp @@ -12,14 +12,14 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "eliminate_view_reshape.h" +#include "eliminate_noop_view_reshape.h" #include #include "pass_level2.h" namespace pnnx { -void eliminate_view_reshape(Graph& graph) +void eliminate_noop_view_reshape(Graph& graph) { while (1) { diff --git a/tools/pnnx/src/pass_level5/eliminate_view_reshape.h b/tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.h similarity index 94% rename from tools/pnnx/src/pass_level5/eliminate_view_reshape.h rename to tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.h index e3996354484e..1d724d99c41f 100644 --- a/tools/pnnx/src/pass_level5/eliminate_view_reshape.h +++ b/tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.h @@ -16,6 +16,6 @@ namespace pnnx { -void eliminate_view_reshape(Graph& graph); +void eliminate_noop_view_reshape(Graph& graph); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/eval_expression.cpp b/tools/pnnx/src/pass_level5/eval_expression.cpp index 4261e5e5f255..7326f1b32c95 100644 --- a/tools/pnnx/src/pass_level5/eval_expression.cpp +++ b/tools/pnnx/src/pass_level5/eval_expression.cpp @@ -45,20 +45,14 @@ static bool token_is_literal(const std::string& t) float f; iss >> std::noskipws >> f; return iss.eof() && !iss.fail(); +} - // for (size_t i = 0; i < t.size(); i++) - // { - // if (i == 0 && t[i] == '-') - // continue; - // - // if (t[i] < '0' || t[i] > '9') - // { - // if (t[i] != '.' && t[i] != 'e') - // return false; - // } - // } - // - // return true; +static bool token_is_interger_literal(const std::string& t) +{ + std::istringstream iss(t); + int f; + iss >> std::noskipws >> f; + return iss.eof() && !iss.fail(); } static std::string eval_expression(const Operator* op) @@ -126,7 +120,17 @@ static std::string eval_expression(const Operator* op) { int bi = std::stoi(b); int r = op->inputs[input_index]->shape[bi]; - exprstack.push(std::to_string(r)); + if (r == -1) + { + // do not evaluate dynamic size info as -1 + // just keep the size expression + std::string r = std::string("size(") + a + "," + b + ")"; + exprstack.push(r); + } + else + { + exprstack.push(std::to_string(r)); + } } } else @@ -135,7 +139,31 @@ static std::string eval_expression(const Operator* op) exprstack.push(r); } } - else if (t == "int" || t == "sqrt" || t == "rsqrt" || t == "neg") + else if (t == "int" + || t == "abs" + || t == "acos" + || t == "acosh" + || t == "asin" + || t == "asinh" + || t == "atan" + || t == "atanh" + || t == "ceil" + || t == "cos" + || t == "cosh" + || t == "exp" + || t == "floor" + || t == "log" + || t == "neg" + || t == "reciprocal" + || t == "rsqrt" + || t == "sign" + || t == "sin" + || t == "sinh" + || t == "sqrt" + || t == "square" + || t == "tan" + || t == "tanh" + || t == "trunc") { std::string a = exprstack.top(); exprstack.pop(); @@ -149,14 +177,69 @@ static std::string eval_expression(const Operator* op) int r = int(af); exprstack.push(std::to_string(r)); } - if (t == "sqrt") + if (t == "abs") { - float r = sqrt(af); + float r = abs(af); exprstack.push(std::to_string(r)); } - if (t == "rsqrt") + if (t == "acos") { - float r = 1.f / sqrt(af); + float r = acos(af); + exprstack.push(std::to_string(r)); + } + if (t == "acosh") + { + float r = acosh(af); + exprstack.push(std::to_string(r)); + } + if (t == "asin") + { + float r = asin(af); + exprstack.push(std::to_string(r)); + } + if (t == "asinh") + { + float r = asinh(af); + exprstack.push(std::to_string(r)); + } + if (t == "atan") + { + float r = atan(af); + exprstack.push(std::to_string(r)); + } + if (t == "atanh") + { + float r = atanh(af); + 
exprstack.push(std::to_string(r)); + } + if (t == "ceil") + { + float r = ceil(af); + exprstack.push(std::to_string(r)); + } + if (t == "cos") + { + float r = cos(af); + exprstack.push(std::to_string(r)); + } + if (t == "cosh") + { + float r = cosh(af); + exprstack.push(std::to_string(r)); + } + if (t == "exp") + { + float r = exp(af); + exprstack.push(std::to_string(r)); + } + if (t == "floor") + { + float r = floor(af); + exprstack.push(std::to_string(r)); + } + if (t == "log") + { + float r = log(af); exprstack.push(std::to_string(r)); } if (t == "neg") @@ -164,6 +247,56 @@ static std::string eval_expression(const Operator* op) float r = -af; exprstack.push(std::to_string(r)); } + if (t == "reciprocal") + { + float r = 1.f / af; + exprstack.push(std::to_string(r)); + } + if (t == "rsqrt") + { + float r = 1.f / sqrt(af); + exprstack.push(std::to_string(r)); + } + if (t == "sign") + { + float r = af > 0.f ? 1.f : (af == 0.f ? 0.f : -1.f); + exprstack.push(std::to_string(r)); + } + if (t == "sin") + { + float r = sin(af); + exprstack.push(std::to_string(r)); + } + if (t == "sinh") + { + float r = sinh(af); + exprstack.push(std::to_string(r)); + } + if (t == "sqrt") + { + float r = sqrt(af); + exprstack.push(std::to_string(r)); + } + if (t == "square") + { + float r = af * af; + exprstack.push(std::to_string(r)); + } + if (t == "tan") + { + float r = tan(af); + exprstack.push(std::to_string(r)); + } + if (t == "tanh") + { + float r = tanh(af); + exprstack.push(std::to_string(r)); + } + if (t == "trunc") + { + float r = trunc(af); + exprstack.push(std::to_string(r)); + } } else { @@ -171,7 +304,14 @@ static std::string eval_expression(const Operator* op) exprstack.push(r); } } - else if (t == "add" || t == "sub" || t == "mul" || t == "div" || t == "floor_divide" || t == "pow" || t == "remainder" || t == "and" || t == "or" || t == "xor") + else if (t == "atan2" + || t == "add" + || t == "sub" + || t == "mul" + || t == "div" + || t == "floor_divide" + || t == "pow" + || t == "remainder") { std::string a = exprstack.top(); exprstack.pop(); @@ -183,6 +323,11 @@ static std::string eval_expression(const Operator* op) float af = std::stof(a); float bf = std::stof(b); + if (t == "atan2") + { + float r = atan2(af, bf); + exprstack.push(std::to_string(r)); + } if (t == "add") { float r = af + bf; @@ -227,6 +372,50 @@ static std::string eval_expression(const Operator* op) exprstack.push(r); } } + else if (t == "and" || t == "or" || t == "xor" || t == "lshift" || t == "rshift") + { + std::string a = exprstack.top(); + exprstack.pop(); + std::string b = exprstack.top(); + exprstack.pop(); + + if (token_is_interger_literal(a) && token_is_interger_literal(b)) + { + int ai = std::stoi(a); + int bi = std::stoi(b); + + if (t == "and") + { + int r = ai & bi; + exprstack.push(std::to_string(r)); + } + if (t == "or") + { + int r = ai | bi; + exprstack.push(std::to_string(r)); + } + if (t == "xor") + { + int r = ai ^ bi; + exprstack.push(std::to_string(r)); + } + if (t == "lshift") + { + int r = ai << bi; + exprstack.push(std::to_string(r)); + } + if (t == "rshift") + { + int r = ai >> bi; + exprstack.push(std::to_string(r)); + } + } + else + { + std::string r = t + "(" + a + "," + b + ")"; + exprstack.push(r); + } + } else if (t == "[") // list { std::vector elements; diff --git a/tools/pnnx/src/pass_level5/fold_constants.cpp b/tools/pnnx/src/pass_level5/fold_constants.cpp index 51c8e71539f9..e5bccd49827f 100644 --- a/tools/pnnx/src/pass_level5/fold_constants.cpp +++ 
b/tools/pnnx/src/pass_level5/fold_constants.cpp @@ -15,12 +15,19 @@ #include "fold_constants.h" #include +#include "storezip.h" #include "pass_level4/dead_code_elimination.h" namespace pnnx { -void fold_constants(Graph& graph, const std::map& foldable_constants) +void fold_constants(Graph& graph, const std::set& foldable_constants, const std::string& foldable_constants_zippath) { + if (foldable_constants.empty()) + return; + + StoreZipReader zip; + zip.open(foldable_constants_zippath); + for (size_t i = 0; i < graph.operands.size(); i++) { Operand* operand = graph.operands[i]; @@ -36,13 +43,23 @@ void fold_constants(Graph& graph, const std::map& foldab // replace producer with attribute Operator* op_new = graph.new_operator_before("pnnx.Attribute", std::string("pnnx_fold_") + name, op); - op_new->attrs[std::string("pnnx_fold_") + name] = foldable_constants.at(name); + op_new->attrs[std::string("pnnx_fold_") + name] = Attribute(); + + Attribute& t2 = op_new->attrs[std::string("pnnx_fold_") + name]; + t2.type = operand->type; + t2.shape = operand->shape; + size_t size = zip.get_file_size(name); + t2.data.resize(size); + zip.read_file(name, t2.data.data()); + op_new->outputs.push_back(operand); operand->producer = op_new; op->outputs.clear(); } + zip.close(); + // dce dead_code_elimination(graph); } diff --git a/tools/pnnx/src/pass_level5/fold_constants.h b/tools/pnnx/src/pass_level5/fold_constants.h index 6ebffbda0646..0d96f9fbd0c7 100644 --- a/tools/pnnx/src/pass_level5/fold_constants.h +++ b/tools/pnnx/src/pass_level5/fold_constants.h @@ -16,6 +16,6 @@ namespace pnnx { -void fold_constants(Graph& graph, const std::map& foldable_constants); +void fold_constants(Graph& graph, const std::set& foldable_constants, const std::string& foldable_constants_zippath); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.cpp b/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.cpp new file mode 100644 index 000000000000..f85050721295 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.cpp @@ -0,0 +1,105 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
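fold_constants now takes only the set of foldable operand names plus the path of a zip archive, and pulls each tensor payload through StoreZipReader into a freshly created pnnx.Attribute, so a single constant at a time needs to be resident in memory. A simplified, self-contained stand-in for the size-then-read step (plain std::ifstream instead of the zip reader, purely for illustration):

// read a raw payload by first sizing the destination buffer, then filling it;
// StoreZipReader's get_file_size()/read_file() pair in the diff above follows
// the same shape against a zip member instead of a loose file.
#include <fstream>
#include <string>
#include <vector>

static std::vector<char> load_payload(const std::string& path)
{
    std::ifstream f(path, std::ios::binary | std::ios::ate);
    std::vector<char> data;
    if (!f)
        return data;                                    // missing constant -> empty payload
    data.resize((size_t)f.tellg());                     // get_file_size() equivalent
    f.seekg(0);
    f.read(data.data(), (std::streamsize)data.size());  // read_file() equivalent
    return data;
}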
+ +#include "fuse_adjacent_reshape.h" + +#include +#include "pass_level2.h" + +namespace pnnx { + +void fuse_adjacent_reshape(Graph& graph) +{ + while (1) + { + bool matched = false; + + for (int i = (int)graph.ops.size() - 1; i > 0; i--) + { + Operator* op = graph.ops[i]; + + // look for Tensor.view / Tensor.reshape / torch.squeeze / torch.unsqueeze chain + if (op->type != "Tensor.view" && op->type != "Tensor.reshape" && op->type != "torch.squeeze" && op->type != "torch.unsqueeze") + continue; + + if ((op->type == "torch.squeeze" || op->type == "torch.unsqueeze") && op->outputs[0]->shape.empty()) + continue; + + std::vector reshapes_to_delete; + const Operand* in0 = op->inputs[0]; + while (in0->consumers.size() == 1 && (in0->producer->type == "Tensor.view" || in0->producer->type == "Tensor.reshape" || in0->producer->type == "torch.squeeze" || in0->producer->type == "torch.unsqueeze")) + { + reshapes_to_delete.push_back(in0->producer); + in0 = in0->producer->inputs[0]; + } + + if (reshapes_to_delete.empty()) + continue; + + // keep the last reshape only + matched = true; + + op->type = "Tensor.reshape"; + + if (!op->outputs[0]->shape.empty()) + { + op->params.clear(); + op->params["shape"] = op->outputs[0]->shape; + } + + for (auto& op0 : reshapes_to_delete) + { + for (auto& x : op0->inputs) + { + x->remove_consumer(op0); + } + + Operand* op0_in = op0->inputs[0]; + Operand* op0_out = op0->outputs[0]; + + for (auto& x : op0_out->consumers) + { + for (size_t j = 0; j < x->inputs.size(); j++) + { + if (x->inputs[j] == op0_out) + x->inputs[j] = op0_in; + } + + op0_in->consumers.push_back(x); + } + + op0_in->name = op0_out->name; + + op0_out->producer = 0; + op0_out->consumers.clear(); + + graph.operands.erase(std::find(graph.operands.begin(), graph.operands.end(), op0_out)); + delete op0_out; + + op0->inputs.clear(); + op0->outputs.clear(); + + graph.ops.erase(std::find(graph.ops.begin(), graph.ops.end(), op0)); + delete op0; + } + + break; + } + + if (!matched) + break; + } +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.h b/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.h similarity index 87% rename from tools/pnnx/src/pass_ncnn/convert_to_fp16_model.h rename to tools/pnnx/src/pass_level5/fuse_adjacent_reshape.h index 3f609d30c4e2..7f3fb51cdf30 100644 --- a/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.h +++ b/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.h @@ -12,14 +12,10 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "pass_ncnn.h" +#include "ir.h" namespace pnnx { -namespace ncnn { - -void convert_to_fp16_model(Graph& graph); - -} // namespace ncnn +void fuse_adjacent_reshape(Graph& graph); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_pad_conv1d.cpp b/tools/pnnx/src/pass_level5/fuse_pad_conv1d.cpp new file mode 100644 index 000000000000..2f1260061b5d --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_pad_conv1d.cpp @@ -0,0 +1,401 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fuse_pad_conv1d.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_pad_conv1d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +F.pad op_pad 1 1 input a mode=constant pad=%pad value=%value +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + float pad_value = 0.f; + if (captured_params.at("value").type == 2) + pad_value = captured_params.at("value").i; + if (captured_params.at("value").type == 3) + pad_value = captured_params.at("value").f; + + if (pad_value != 0.f) + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2) + return false; + + if (pad.size() == 2 && pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& pad = captured_params.at("pad").ai; + std::vector padding = captured_params.at("padding").ai; + padding[0] += pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv1d_pass_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +F.pad op_pad 1 1 input a mode=%mode pad=%pad +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // reflect/replicate + nopad + if (captured_params.at("mode").s != "reflect" && captured_params.at("mode").s != "replicate") + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return 
false; + } + + if (pad.size() != 2) + return false; + + if (pad.size() == 2 && pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& pad = captured_params.at("pad").ai; + std::vector padding(1); + padding[0] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = captured_params.at("mode"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv1d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ConstantPad1d op_pad 1 1 input a padding=%pad value=%value +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + float pad_value = 0.f; + if (captured_params.at("value").type == 2) + pad_value = captured_params.at("value").i; + if (captured_params.at("value").type == 3) + pad_value = captured_params.at("value").f; + + if (pad_value != 0.f) + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2) + return false; + + if (pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding = captured_params.at("padding").ai; + const std::vector& pad = captured_params.at("pad").ai; + padding[0] += pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv1d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ReplicationPad1d op_pad 1 1 input a padding=%pad +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias 
+pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // replicate + nopad + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2) + return false; + + if (pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding(1); + const std::vector& pad = captured_params.at("pad").ai; + padding[0] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "replicate"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv1d_pass_4 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ReflectionPad1d op_pad 1 1 input a padding=%pad +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // reflect + nopad + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2) + return false; + + if (pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding(1); + const std::vector& pad = captured_params.at("pad").ai; + padding[0] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "reflect"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +void fuse_pad_conv1d(Graph& graph) +{ + fuse_pad_conv1d_pass a; + fuse_pad_conv1d_pass_1 b; + fuse_pad_conv1d_pass_2 c; + fuse_pad_conv1d_pass_3 d; + fuse_pad_conv1d_pass_4 e; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); + pnnx_graph_rewrite(graph, &d, opindex); + pnnx_graph_rewrite(graph, 
&e, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_pad_conv1d.h b/tools/pnnx/src/pass_level5/fuse_pad_conv1d.h new file mode 100644 index 000000000000..f121b340cb08 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_pad_conv1d.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_pad_conv1d(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_pad_conv2d.cpp b/tools/pnnx/src/pass_level5/fuse_pad_conv2d.cpp new file mode 100644 index 000000000000..3723ed9c0e9a --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_pad_conv2d.cpp @@ -0,0 +1,500 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_pad_conv2d.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_pad_conv2d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +F.pad op_pad 1 1 input a mode=constant pad=%pad value=%value +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + float pad_value = 0.f; + if (captured_params.at("value").type == 2) + pad_value = captured_params.at("value").i; + if (captured_params.at("value").type == 3) + pad_value = captured_params.at("value").f; + + if (pad_value != 0.f) + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2 && pad.size() != 4) + return false; + + if (pad.size() == 2 && pad[0] != pad[1]) + return false; + + if (pad.size() == 4 && (pad[0] != pad[1] || pad[2] != pad[3])) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& pad = captured_params.at("pad").ai; + std::vector padding = captured_params.at("padding").ai; + + if (pad.size() == 2) + { + padding[1] += pad[0]; + } + else if (pad.size() == 4) + { + padding[0] += pad[2]; + padding[1] += pad[0]; + } + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +F.pad op_pad 1 1 input a mode=%mode pad=%pad +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // reflect/replicate + nopad + if (captured_params.at("mode").s != "reflect" && captured_params.at("mode").s != "replicate") + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2 && pad.size() != 4) + return false; + + if (pad.size() == 2 && pad[0] != pad[1]) + return false; + + if (pad.size() == 4 && (pad[0] != pad[1] || pad[2] != pad[3])) + 
return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& pad = captured_params.at("pad").ai; + std::vector padding(2); + + if (pad.size() == 2) + { + padding[0] = 0; + padding[1] = pad[0]; + } + else if (pad.size() == 4) + { + padding[0] = pad[2]; + padding[1] = pad[0]; + } + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = captured_params.at("mode"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ConstantPad2d op_pad 1 1 input a padding=%pad value=%value +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + float pad_value = 0.f; + if (captured_params.at("value").type == 2) + pad_value = captured_params.at("value").i; + if (captured_params.at("value").type == 3) + pad_value = captured_params.at("value").f; + + if (pad_value != 0.f) + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 4) + return false; + + if (pad[0] != pad[1] || pad[2] != pad[3]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding = captured_params.at("padding").ai; + const std::vector& pad = captured_params.at("pad").ai; + padding[0] += pad[2]; + padding[1] += pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ZeroPad2d op_pad 1 1 input a padding=%pad +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride 
padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 4) + return false; + + if (pad[0] != pad[1] || pad[2] != pad[3]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding = captured_params.at("padding").ai; + const std::vector& pad = captured_params.at("pad").ai; + padding[0] += pad[2]; + padding[1] += pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_4 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ReplicationPad2d op_pad 1 1 input a padding=%pad +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // replicate + nopad + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 4) + return false; + + if (pad[0] != pad[1] || pad[2] != pad[3]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding(2); + const std::vector& pad = captured_params.at("pad").ai; + padding[0] = pad[2]; + padding[1] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "replicate"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_5 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input 
+nn.ReflectionPad2d op_pad 1 1 input a padding=%pad +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // reflect + nopad + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 4) + return false; + + if (pad[0] != pad[1] || pad[2] != pad[3]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding(2); + const std::vector& pad = captured_params.at("pad").ai; + padding[0] = pad[2]; + padding[1] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "reflect"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +void fuse_pad_conv2d(Graph& graph) +{ + fuse_pad_conv2d_pass a; + fuse_pad_conv2d_pass_1 b; + fuse_pad_conv2d_pass_2 c; + fuse_pad_conv2d_pass_3 d; + fuse_pad_conv2d_pass_4 e; + fuse_pad_conv2d_pass_5 f; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); + pnnx_graph_rewrite(graph, &d, opindex); + pnnx_graph_rewrite(graph, &e, opindex); + pnnx_graph_rewrite(graph, &f, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_pad_conv2d.h b/tools/pnnx/src/pass_level5/fuse_pad_conv2d.h new file mode 100644 index 000000000000..fb47be50ec71 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_pad_conv2d.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
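The 2d passes additionally translate index order: F.pad lists padding from the last dimension outward, so for an NCHW tensor pad is {left, right, top, bottom}, while nn.Conv2d's padding is ordered {height, width}; that is why padding[0] takes pad[2] and padding[1] takes pad[0] in the write() methods above. In isolation:

// map F.pad's {left, right, top, bottom} onto Conv2d's {height, width} padding
#include <array>
#include <stdio.h>

int main()
{
    std::array<int, 4> pad = {1, 1, 2, 2};               // symmetric {left, right, top, bottom}
    std::array<int, 2> conv_padding = {pad[2], pad[0]};   // {top/bottom, left/right}
    printf("padding = (%d, %d)\n", conv_padding[0], conv_padding[1]); // (2, 1)
    return 0;
}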
+ +#include "ir.h" + +namespace pnnx { + +void fuse_pad_conv2d(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_slice_copy.cpp b/tools/pnnx/src/pass_level5/fuse_slice_copy.cpp new file mode 100644 index 000000000000..0a5fabab7afc --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_slice_copy.cpp @@ -0,0 +1,279 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fuse_slice_copy.h" + +#include +#include +#include +#include "pass_level2.h" + +namespace pnnx { + +void fuse_slice_copy(Graph& graph) +{ + while (1) + { + bool matched = false; + + for (size_t i = 0; i < graph.ops.size(); i++) + { + Operator* op = graph.ops[i]; + + if (op->type != "Tensor.copy") + continue; + + // collect slice / select op chain + std::stack slice_select_ops; + int descent_dim_current = INT_MAX; + const Operand* in0 = op->inputs[0]; + while (in0->producer->type == "Tensor.slice" || in0->producer->type == "Tensor.select") + { + const Operator* sop = in0->producer; + if (sop->type == "Tensor.slice") + { + if (sop->params.find("dims") == sop->params.end() + || sop->params.find("starts") == sop->params.end() + || sop->params.find("ends") == sop->params.end() + || sop->params.find("steps") == sop->params.end()) + { + fprintf(stderr, "dynamic index in slice copy chain is not supported\n"); + break; + } + + int dims0 = sop->params.at("dims").ai[0]; + if (descent_dim_current < dims0) + { + break; + } + + descent_dim_current = dims0; + } + + if (sop->type == "Tensor.select") + { + if (sop->params.find("dim") == sop->params.end() + || sop->params.find("index") == sop->params.end()) + { + fprintf(stderr, "dynamic index in select copy chain is not supported\n"); + break; + } + + int dim = sop->params.at("dim").i; + if (descent_dim_current < dim) + { + break; + } + + descent_dim_current = dim; + } + + slice_select_ops.push(sop); + in0 = sop->inputs[0]; + } + + matched = true; + + if (slice_select_ops.empty()) + { + // eliminate noop copy + Operand* out = op->outputs[0]; + + for (auto& x : out->consumers) + { + for (size_t j = 0; j < x->inputs.size(); j++) + { + if (x->inputs[j] == out) + x->inputs[j] = op->inputs[1]; + } + + op->inputs[1]->consumers.push_back(x); + } + + op->inputs[0]->remove_consumer(op); + op->inputs[1]->remove_consumer(op); + + op->inputs[1]->name = out->name; + + out->producer = 0; + out->consumers.clear(); + + graph.operands.erase(std::find(graph.operands.begin(), graph.operands.end(), out)); + delete out; + + op->inputs.clear(); + op->outputs.clear(); + + graph.ops.erase(graph.ops.begin() + i); + delete op; + + break; + } + + const Operator* top_sop = slice_select_ops.top(); + + // construct one-step slice + std::vector new_dims; + std::vector new_starts; + std::vector new_ends; + std::vector new_steps; + + int select_dims_offset = 0; + while (!slice_select_ops.empty()) + { + const Operator* sop = 
slice_select_ops.top(); + slice_select_ops.pop(); + + if (sop->type == "Tensor.slice") + { + std::vector dims = sop->params.at("dims").ai; + std::vector starts = sop->params.at("starts").ai; + std::vector ends = sop->params.at("ends").ai; + std::vector steps = sop->params.at("steps").ai; + + for (size_t j = 0; j < dims.size(); j++) + { + dims[j] += select_dims_offset; + } + + new_dims.insert(new_dims.end(), dims.begin(), dims.end()); + new_starts.insert(new_starts.end(), starts.begin(), starts.end()); + new_ends.insert(new_ends.end(), ends.begin(), ends.end()); + new_steps.insert(new_steps.end(), steps.begin(), steps.end()); + } + else if (sop->type == "Tensor.select") + { + int dim = sop->params.at("dim").i; + int index = sop->params.at("index").i; + + dim += select_dims_offset; + int end = index + 1; + if (index == -1) + end = INT_MAX; + + new_dims.push_back(dim); + new_starts.push_back(index); + new_ends.push_back(end); + new_steps.push_back(1); + + select_dims_offset += 1; + } + } + + op->type = "Tensor.slice_copy"; + + // insert clone before any slices + Operator* op_clone = graph.new_operator_before("Tensor.clone", op->name + "_ncnnclone", top_sop); + Operand* clone_out = graph.new_operand(op->name + "_ncnnclone_out"); + + clone_out->shape = top_sop->inputs[0]->shape; + + op_clone->inputs.push_back(top_sop->inputs[0]); + top_sop->inputs[0]->consumers.push_back(op_clone); + + op_clone->outputs.push_back(clone_out); + clone_out->producer = op_clone; + + op->inputs[0]->remove_consumer(op); + op->inputs[0] = clone_out; + clone_out->consumers.push_back(op); + + op->params["dims"] = new_dims; + op->params["starts"] = new_starts; + op->params["ends"] = new_ends; + op->params["steps"] = new_steps; + + int input_rank = (int)op->inputs[0]->shape.size(); + if (input_rank == 0) + { + // insert view_as(sliced) for different or unknown rank + Operator* op_slice = graph.new_operator_before("Tensor.slice", op->name + "_ncnnslice", op); + Operator* op_view_as = graph.new_operator_before("Tensor.view_as", op->name + "_ncnnview_as", op); + + Operand* slice_out = graph.new_operand(op->name + "_ncnnslice_out"); + Operand* view_as_out = graph.new_operand(op->name + "_ncnnview_as_out"); + + op_slice->params["dims"] = new_dims; + op_slice->params["starts"] = new_starts; + op_slice->params["ends"] = new_ends; + op_slice->params["steps"] = new_steps; + + op_slice->inputs.push_back(op->inputs[0]); + op->inputs[0]->consumers.push_back(op_slice); + + op_slice->outputs.push_back(slice_out); + slice_out->producer = op_slice; + + op_view_as->inputs.push_back(op->inputs[1]); + op->inputs[1]->consumers.push_back(op_view_as); + op->inputs[1]->remove_consumer(op); + op_view_as->inputs.push_back(slice_out); + slice_out->consumers.push_back(op_view_as); + + op_view_as->outputs.push_back(view_as_out); + view_as_out->producer = op_view_as; + + op->inputs[1] = view_as_out; + view_as_out->consumers.push_back(op); + } + else if (input_rank != (int)op->inputs[1]->shape.size()) + { + // solve the target shape + std::vector target_shape = op->inputs[0]->shape; + for (size_t j = 0; j < new_dims.size(); j++) + { + int dim = new_dims[j]; + int start = new_starts[j]; + int end = new_ends[j]; + int step = new_steps[j]; + + if (dim < 0) + dim = input_rank + dim; + if (start < 0) + start = target_shape[dim] + start; + if (end < 0) + end = target_shape[dim] + end; + if (end == INT_MAX) + end = target_shape[dim]; + + target_shape[dim] = (end - start + (step - 1)) / step; + } + + Operator* op_view = 
graph.new_operator_before("Tensor.view", op->name + "_ncnnview", op); + Operand* view_out = graph.new_operand(op->name + "_ncnnview_out"); + + op_view->params["shape"] = target_shape; + + view_out->shape = target_shape; + + op_view->inputs.push_back(op->inputs[1]); + op->inputs[1]->consumers.push_back(op_view); + op->inputs[1]->remove_consumer(op); + + op_view->outputs.push_back(view_out); + view_out->producer = op_view; + + op->inputs[1] = view_out; + view_out->consumers.push_back(op); + } + + break; + } + + if (!matched) + break; + } +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_slice_copy.h b/tools/pnnx/src/pass_level5/fuse_slice_copy.h new file mode 100644 index 000000000000..db3aef773594 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_slice_copy.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_slice_copy(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.cpp b/tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.cpp index 2162908b4282..6ccacd5628f7 100644 --- a/tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.cpp +++ b/tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.cpp @@ -14,6 +14,7 @@ #include "fuse_slice_to_tensor_split.h" +#include #include #include "pass_level2.h" @@ -102,7 +103,7 @@ void fuse_slice_to_tensor_split(Graph& graph) cur = op2; int end2 = op2->params.at("ends").ai[0]; - if (end2 == -1) + if (end2 == INT_MAX) { slice_n_ops.push_back(op2); full_dimsize_slice = true; diff --git a/tools/pnnx/src/pass_level5/fuse_static_batchnorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_batchnorm.cpp new file mode 100644 index 000000000000..0a3b9fbe405b --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_batchnorm.cpp @@ -0,0 +1,384 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
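fuse_slice_copy collapses a Tensor.slice / Tensor.select chain feeding Tensor.copy into one Tensor.slice_copy, inserting a Tensor.clone ahead of the chain and, when ranks differ, a view whose shape is solved from the slice parameters; the related fuse_slice_to_tensor_split change switches the "slice to the end" sentinel from -1 to INT_MAX. The per-dimension extent computation used when solving that target shape, shown standalone:

// normalise negative / end-sentinel indices, then ceil-divide by the step,
// exactly as in the rank-mismatch branch above
#include <limits.h>
#include <stdio.h>

static int sliced_extent(int dim_size, int start, int end, int step)
{
    if (start < 0) start = dim_size + start;
    if (end < 0) end = dim_size + end;
    if (end == INT_MAX) end = dim_size;        // "slice to the end"
    return (end - start + (step - 1)) / step;  // ceil((end - start) / step)
}

int main()
{
    printf("%d\n", sliced_extent(10, 2, INT_MAX, 3)); // elements 2, 5, 8 -> 3
    return 0;
}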
+ +#include "fuse_static_batchnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Fbatchnorm_pass_1d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +F.batchnorm op_0 3 1 input running_mean running_var out weight=None bias=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm1d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 2 || input_rank == 3; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = false; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + } +}; + +class fuse_static_Fbatchnorm_pass_1d_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.batch_norm op_0 5 1 input running_mean running_var weight bias out eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm1d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 2 || input_rank == 3; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Fbatchnorm_pass_2d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +F.batchnorm op_0 3 1 input running_mean running_var out weight=None bias=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm2d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& 
matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 4; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = false; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + } +}; + +class fuse_static_Fbatchnorm_pass_2d_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.batch_norm op_0 5 1 input running_mean running_var weight bias out eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm2d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 4; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Fbatchnorm_pass_3d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +F.batchnorm op_0 3 1 input running_mean running_var out weight=None bias=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm3d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 5; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = false; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + } +}; + +class 
fuse_static_Fbatchnorm_pass_3d_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.batch_norm op_0 5 1 input running_mean running_var weight bias out eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm3d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 5; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_batchnorm(Graph& graph) +{ + fuse_static_Fbatchnorm_pass_1d a; + fuse_static_Fbatchnorm_pass_2d b; + fuse_static_Fbatchnorm_pass_3d c; + fuse_static_Fbatchnorm_pass_1d_1 a1; + fuse_static_Fbatchnorm_pass_2d_1 b1; + fuse_static_Fbatchnorm_pass_3d_1 c1; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); + pnnx_graph_rewrite(graph, &a1, opindex); + pnnx_graph_rewrite(graph, &b1, opindex); + pnnx_graph_rewrite(graph, &c1, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_batchnorm.h b/tools/pnnx/src/pass_level5/fuse_static_batchnorm.h new file mode 100644 index 000000000000..7ffc7ca2ce88 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_batchnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
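fuse_static_batchnorm matches F.batch_norm whose running statistics (and optionally weight/bias) come from pnnx.Attribute operands and repackages it as nn.BatchNorm1d/2d/3d, choosing the module from the input rank (2-3, 4, or 5) and setting affine accordingly. For reference, the functional form being repackaged (a sketch, not pnnx code):

// per-element batch norm; with affine=false the weight/bias terms drop out
#include <math.h>
#include <stdio.h>

static float batch_norm_1(float x, float mean, float var, float eps,
                          float weight = 1.f, float bias = 0.f)
{
    return (x - mean) / sqrtf(var + eps) * weight + bias;
}

int main()
{
    printf("%f\n", batch_norm_1(3.f, 1.f, 4.f, 1e-5f)); // ~ (3 - 1) / 2 = 1.0
    return 0;
}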
+ +#include "ir.h" + +namespace pnnx { + +void fuse_static_batchnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_conv.cpp b/tools/pnnx/src/pass_level5/fuse_static_conv.cpp index 7d5e256d9acb..6e29bcaaccce 100644 --- a/tools/pnnx/src/pass_level5/fuse_static_conv.cpp +++ b/tools/pnnx/src/pass_level5/fuse_static_conv.cpp @@ -120,6 +120,82 @@ pnnx.Output output 1 0 out } }; +class fuse_static_Fconv1d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv1d op_0 2 1 input weight a bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups +pnnx.Expression op_1 2 1 a bias out expr=%expr +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match(const std::map& captured_params, const std::map& captured_attrs) const + { + const std::string& expr = captured_params.at("expr").s; + if (expr != "add(@0,@1)") + return false; + + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + int out_channels = weight.shape[0]; + if (bias.shape != std::vector{1, out_channels, 1}) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_channels"] = weight.shape[1] * captured_params.at("groups").i; + op->params["out_channels"] = weight.shape[0]; + op->params["kernel_size"] = std::vector{weight.shape[2]}; + op->params["padding_mode"] = std::string("zeros"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + class fuse_static_Fconv2d_pass : public GraphRewriterPass { public: @@ -219,6 +295,82 @@ pnnx.Output output 1 0 out } }; +class fuse_static_Fconv2d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv2d op_0 2 1 input weight a bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups +pnnx.Expression op_1 2 1 a bias out expr=%expr +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match(const std::map& captured_params, const std::map& captured_attrs) const + { + const std::string& expr = captured_params.at("expr").s; + if (expr != "add(@0,@1)") + return false; + + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + int out_channels = 
weight.shape[0]; + if (bias.shape != std::vector{1, out_channels, 1, 1}) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_channels"] = weight.shape[1] * captured_params.at("groups").i; + op->params["out_channels"] = weight.shape[0]; + op->params["kernel_size"] = std::vector{weight.shape[2], weight.shape[3]}; + op->params["padding_mode"] = std::string("zeros"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + class fuse_static_Fconv3d_pass : public GraphRewriterPass { public: @@ -318,8 +470,88 @@ pnnx.Output output 1 0 out } }; +class fuse_static_Fconv3d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv3d op_0 2 1 input weight a bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups +pnnx.Expression op_1 2 1 a bias out expr=%expr +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv3d"; + } + + const char* name_str() const + { + return "conv3d"; + } + + bool match(const std::map& captured_params, const std::map& captured_attrs) const + { + const std::string& expr = captured_params.at("expr").s; + if (expr != "add(@0,@1)") + return false; + + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + int out_channels = weight.shape[0]; + if (bias.shape != std::vector{1, out_channels, 1, 1, 1}) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_channels"] = weight.shape[1] * captured_params.at("groups").i; + op->params["out_channels"] = weight.shape[0]; + op->params["kernel_size"] = std::vector{weight.shape[2], weight.shape[3], weight.shape[4]}; + op->params["padding_mode"] = std::string("zeros"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + void fuse_static_conv(Graph& graph) { + fuse_static_Fconv1d_pass_3 a3; + fuse_static_Fconv2d_pass_3 a4; + fuse_static_Fconv3d_pass_3 a5; + fuse_static_Fconv1d_pass a; fuse_static_Fconv1d_pass_2 b; fuse_static_Fconv2d_pass c; @@ -328,6 +560,10 @@ void fuse_static_conv(Graph& graph) fuse_static_Fconv3d_pass_2 f; int opindex = 0; + pnnx_graph_rewrite(graph, &a3, opindex); + 
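The new F.convNd + pnnx.Expression patterns above only fire when the expression is exactly add(@0,@1) and the added attribute has shape {1, out_channels, 1, ...}, i.e. it broadcasts as a pure per-channel bias that can be absorbed into a biased nn.ConvNd. The shape test in isolation (2d case, stand-in function name):

// a {1, C, 1, 1} addend against an NCHW conv output is a per-channel bias
#include <stdio.h>
#include <vector>

static bool is_per_channel_bias_2d(const std::vector<int>& shape, int out_channels)
{
    return shape == std::vector<int>{1, out_channels, 1, 1};
}

int main()
{
    printf("%d\n", (int)is_per_channel_bias_2d({1, 64, 1, 1}, 64)); // 1 -> foldable
    printf("%d\n", (int)is_per_channel_bias_2d({64}, 64));          // 0 -> left alone
    return 0;
}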
pnnx_graph_rewrite(graph, &a4, opindex); + pnnx_graph_rewrite(graph, &a5, opindex); + pnnx_graph_rewrite(graph, &a, opindex); pnnx_graph_rewrite(graph, &b, opindex); pnnx_graph_rewrite(graph, &c, opindex); diff --git a/tools/pnnx/src/pass_level5/fuse_static_convtranspose.cpp b/tools/pnnx/src/pass_level5/fuse_static_convtranspose.cpp new file mode 100644 index 000000000000..6f6e164952ad --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_convtranspose.cpp @@ -0,0 +1,351 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fuse_static_convtranspose.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Fconvtranspose1d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +F.conv_transpose1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose1d"; + } + + const char* name_str() const + { + return "conv_transpose1d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = false; + + op->attrs["weight"] = weight; + } +}; + +class fuse_static_Fconvtranspose1d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv_transpose1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose1d"; + } + + const char* name_str() const + { + return "conv_transpose1d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if 
(x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Fconvtranspose2d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +F.conv_transpose2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose2d"; + } + + const char* name_str() const + { + return "conv_transpose2d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2], weight.shape[3]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = false; + + op->attrs["weight"] = weight; + } +}; + +class fuse_static_Fconvtranspose2d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv_transpose2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose2d"; + } + + const char* name_str() const + { + return "conv_transpose2d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2], weight.shape[3]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = 
bias; + } +}; + +class fuse_static_Fconvtranspose3d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +F.conv_transpose3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose3d"; + } + + const char* name_str() const + { + return "conv_transpose3d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2], weight.shape[3], weight.shape[4]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = false; + + op->attrs["weight"] = weight; + } +}; + +class fuse_static_Fconvtranspose3d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv_transpose3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose3d"; + } + + const char* name_str() const + { + return "conv_transpose3d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2], weight.shape[3], weight.shape[4]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_convtranspose(Graph& graph) +{ + fuse_static_Fconvtranspose1d_pass a; + fuse_static_Fconvtranspose1d_pass_2 b; + fuse_static_Fconvtranspose2d_pass c; + fuse_static_Fconvtranspose2d_pass_2 d; + fuse_static_Fconvtranspose3d_pass e; + fuse_static_Fconvtranspose3d_pass_2 f; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); + pnnx_graph_rewrite(graph, &d, opindex); + pnnx_graph_rewrite(graph, &e, opindex); + pnnx_graph_rewrite(graph, &f, opindex); +} + +} // namespace pnnx 
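Note on the transposed-convolution fusions above: PyTorch stores an F.conv_transpose weight as (in_channels, out_channels / groups, kernel...), which is why these passes read in_channels from weight.shape[0] and out_channels from weight.shape[1] * groups, the reverse of the plain F.conv passes earlier in this patch. Below is a minimal standalone sketch of that derivation, assuming a plain shape vector; the struct and helper names are illustrative only and not part of pnnx.

// Illustrative sketch, not part of the patch.
// PyTorch weight layout for ConvTranspose1d: (in_channels, out_channels / groups, kW).
#include <cstdio>
#include <vector>

struct ConvTranspose1dHyperParams
{
    int in_channels;
    int out_channels;
    int kernel_w;
};

static ConvTranspose1dHyperParams deduce_from_weight(const std::vector<int>& weight_shape, int groups)
{
    ConvTranspose1dHyperParams p;
    p.in_channels = weight_shape[0];            // dim 0 holds in_channels
    p.out_channels = weight_shape[1] * groups;  // dim 1 holds out_channels / groups
    p.kernel_w = weight_shape[2];
    return p;
}

int main()
{
    // e.g. nn.ConvTranspose1d(16, 32, kernel_size=3, groups=4) has weight shape (16, 8, 3)
    ConvTranspose1dHyperParams p = deduce_from_weight({16, 8, 3}, 4);
    printf("in_channels=%d out_channels=%d kernel=%d\n", p.in_channels, p.out_channels, p.kernel_w);
    return 0;
}

The fused nn.ConvTranspose operators keep the weight attribute in this PyTorch layout; any reordering into ncnn's own output-channel-major layout is left to the later ncnn conversion passes rather than to these level-5 fusions.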
diff --git a/tools/pnnx/src/pass_level5/fuse_static_convtranspose.h b/tools/pnnx/src/pass_level5/fuse_static_convtranspose.h new file mode 100644 index 000000000000..2474074a1505 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_convtranspose.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_convtranspose(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_groupnorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_groupnorm.cpp new file mode 100644 index 000000000000..203168e2596d --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_groupnorm.cpp @@ -0,0 +1,79 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_static_groupnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Fgroupnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.group_norm op_0 3 1 input weight bias out num_groups=%num_groups eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.GroupNorm"; + } + + const char* name_str() const + { + return "group_norm"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_channels"] = weight.shape[0]; + op->params["num_groups"] = captured_params.at("num_groups"); + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_groupnorm(Graph& graph) +{ + fuse_static_Fgroupnorm_pass a; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_groupnorm.h b/tools/pnnx/src/pass_level5/fuse_static_groupnorm.h new file mode 100644 index 000000000000..2de65fa307b7 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_groupnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_groupnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_instancenorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_instancenorm.cpp new file mode 100644 index 000000000000..5bf08017f6db --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_instancenorm.cpp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_static_instancenorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Finstancenorm_pass_1d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.instance_norm op_0 3 1 input weight bias out running_mean=None running_var=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.InstanceNorm1d"; + } + + const char* name_str() const + { + return "instance_norm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 2 || input_rank == 3; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = weight.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + op->params["track_running_stats"] = false; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Finstancenorm_pass_2d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.instance_norm op_0 3 1 input weight bias out running_mean=None running_var=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.InstanceNorm1d"; + } + + const char* name_str() const + { + return "instance_norm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 4; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = weight.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + op->params["track_running_stats"] = false; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Finstancenorm_pass_3d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.instance_norm op_0 3 1 input weight bias out running_mean=None running_var=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.InstanceNorm1d"; + } + + const char* name_str() const + { + return "instance_norm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 5; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if 
(x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = weight.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + op->params["track_running_stats"] = false; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_instancenorm(Graph& graph) +{ + fuse_static_Finstancenorm_pass_1d a; + fuse_static_Finstancenorm_pass_2d b; + fuse_static_Finstancenorm_pass_3d c; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_instancenorm.h b/tools/pnnx/src/pass_level5/fuse_static_instancenorm.h new file mode 100644 index 000000000000..df71b0e52a75 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_instancenorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_instancenorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_layernorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_layernorm.cpp new file mode 100644 index 000000000000..d6c494f089d8 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_layernorm.cpp @@ -0,0 +1,78 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_static_layernorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Flayernorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.layer_norm op_0 3 1 input weight bias out normalized_shape=%normalized_shape eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.LayerNorm"; + } + + const char* name_str() const + { + return "layer_norm"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["normalized_shape"] = captured_params.at("normalized_shape"); + op->params["eps"] = captured_params.at("eps"); + op->params["elementwise_affine"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_layernorm(Graph& graph) +{ + fuse_static_Flayernorm_pass a; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_layernorm.h b/tools/pnnx/src/pass_level5/fuse_static_layernorm.h new file mode 100644 index 000000000000..e61f254d2b58 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_layernorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_layernorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_linear.cpp b/tools/pnnx/src/pass_level5/fuse_static_linear.cpp new file mode 100644 index 000000000000..a34177e20ee3 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_linear.cpp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_static_linear.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Flinear_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +F.linear op_0 2 1 input weight out bias=None +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Linear"; + } + + const char* name_str() const + { + return "linear"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + } + + op->params["in_features"] = weight.shape[1]; + op->params["out_features"] = weight.shape[0]; + op->params["bias"] = false; + + op->attrs["weight"] = weight; + } +}; + +class fuse_static_Flinear_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.linear op_0 3 1 input weight bias out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Linear"; + } + + const char* name_str() const + { + return "linear"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_features"] = weight.shape[1]; + op->params["out_features"] = weight.shape[0]; + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Flinear_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.linear op_0 2 1 input weight a bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups +pnnx.Expression op_1 2 1 a bias out expr=%expr +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Linear"; + } + + const char* name_str() const + { + return "linear"; + } + + bool match(const std::map& captured_params, const std::map& captured_attrs) const + { + const std::string& expr = captured_params.at("expr").s; + if (expr != "add(@0,@1)") + return false; + + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + int out_channels = weight.shape[0]; + if (bias.shape != std::vector{1, out_channels, 1}) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_features"] = weight.shape[1]; + op->params["out_features"] = weight.shape[0]; + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; 
+ +void fuse_static_linear(Graph& graph) +{ + fuse_static_Flinear_pass_3 a3; + + fuse_static_Flinear_pass a; + fuse_static_Flinear_pass_2 b; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a3, opindex); + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_linear.h b/tools/pnnx/src/pass_level5/fuse_static_linear.h new file mode 100644 index 000000000000..8c26f924c166 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_linear.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_linear(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/unroll_rnn_op.cpp b/tools/pnnx/src/pass_level5/unroll_rnn_op.cpp index c832353be229..2fda02423099 100644 --- a/tools/pnnx/src/pass_level5/unroll_rnn_op.cpp +++ b/tools/pnnx/src/pass_level5/unroll_rnn_op.cpp @@ -42,6 +42,7 @@ void unroll_rnn_op(Graph& graph) bool has_output_hidden = op->outputs.size() >= 2; bool has_output_cell = op->outputs.size() == 3; const int hidden_size = op->params["hidden_size"].i; + const int proj_size = (op->type == "nn.LSTM") ? op->params["proj_size"].i : 0; bool has_bias = op->params["bias"].b; bool is_bidirectional = op->params["bidirectional"].b; @@ -116,7 +117,14 @@ void unroll_rnn_op(Graph& graph) } else { - op1->params["input_size"] = is_bidirectional ? hidden_size * 2 : hidden_size; + if (proj_size) + { + op1->params["input_size"] = is_bidirectional ? proj_size * 2 : proj_size; + } + else + { + op1->params["input_size"] = is_bidirectional ? 
hidden_size * 2 : hidden_size; + } op1->inputs.push_back(unrolled_ops[j - 1]->outputs[0]); op1->inputs[0]->consumers.push_back(op1); @@ -171,6 +179,11 @@ void unroll_rnn_op(Graph& graph) op1->attrs["bias_ih_l0"] = op->attrs["bias_ih_l" + std::to_string(j)]; } + if (proj_size) + { + op1->attrs["weight_hr_l0"] = op->attrs["weight_hr_l" + std::to_string(j)]; + } + if (is_bidirectional) { op1->attrs["weight_hh_l0_reverse"] = op->attrs["weight_hh_l" + std::to_string(j) + "_reverse"]; @@ -181,6 +194,11 @@ void unroll_rnn_op(Graph& graph) op1->attrs["bias_hh_l0_reverse"] = op->attrs["bias_hh_l" + std::to_string(j) + "_reverse"]; op1->attrs["bias_ih_l0_reverse"] = op->attrs["bias_ih_l" + std::to_string(j) + "_reverse"]; } + + if (proj_size) + { + op1->attrs["weight_hr_l0_reverse"] = op->attrs["weight_hr_l" + std::to_string(j) + "_reverse"]; + } } unrolled_ops[j] = op1; diff --git a/tools/pnnx/src/pass_ncnn.cpp b/tools/pnnx/src/pass_ncnn.cpp index 603b6f20705f..309692f29429 100644 --- a/tools/pnnx/src/pass_ncnn.cpp +++ b/tools/pnnx/src/pass_ncnn.cpp @@ -31,7 +31,6 @@ #include "pass_ncnn/insert_split.h" #include "pass_ncnn/chain_multi_output.h" #include "pass_ncnn/solve_batch_index.h" -#include "pass_ncnn/convert_to_fp16_model.h" #include "pass_ncnn/eliminate_noop.h" #include "pass_ncnn/eliminate_tail_reshape_permute.h" @@ -137,8 +136,6 @@ void pass_ncnn(Graph& g) ncnn::convert_input(g); ncnn::eliminate_output(g); - - ncnn::convert_to_fp16_model(g); } } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_conv1d.cpp b/tools/pnnx/src/pass_ncnn/F_conv1d.cpp index 0d969caca48f..c861842b95fb 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv1d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv1d.cpp @@ -18,254 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv1d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution1D"; - } - - const char* name_str() const - { - return "conv1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv1d, 20) - -class F_conv1d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=1 
-pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution1D"; - } - - const char* name_str() const - { - return "conv1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv1d_1, 20) - -class F_conv1d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise1D"; - } - - const char* name_str() const - { - return "convdw1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv1d_2, 21) - -class F_conv1d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise1D"; - } - - const char* name_str() const - { - return "convdw1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if 
(x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv1d_3, 21) - class F_conv1d_4 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/F_conv2d.cpp b/tools/pnnx/src/pass_ncnn/F_conv2d.cpp index 0814a470957d..8480b80aa28e 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv2d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv2d.cpp @@ -18,270 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv2d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution"; - } - - const char* name_str() const - { - return "conv2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv2d, 20) - -class F_conv2d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution"; - } - - const char* name_str() const - { - return "conv2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if 
(x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv2d_1, 20) - -class F_conv2d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise"; - } - - const char* name_str() const - { - return "convdw2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv2d_2, 21) - -class F_conv2d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise"; - } - - const char* name_str() const - { - return "convdw2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 
10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv2d_3, 21) - class F_conv2d_4 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/F_conv3d.cpp b/tools/pnnx/src/pass_ncnn/F_conv3d.cpp index 317e220a0b22..890f36cc92a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv3d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv3d.cpp @@ -18,286 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv3d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution3D"; - } - - const char* name_str() const - { - return "conv3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv3d, 20) - -class F_conv3d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias 
@qwq -F.conv3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution3D"; - } - - const char* name_str() const - { - return "conv3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv3d_1, 20) - -class F_conv3d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise3D"; - } - - const char* name_str() const - { - return "convdw3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] 
= Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv3d_2, 21) - -class F_conv3d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise3D"; - } - - const char* name_str() const - { - return "convdw3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv3d_3, 21) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_conv_transpose1d.cpp b/tools/pnnx/src/pass_ncnn/F_conv_transpose1d.cpp index fd121d3c2293..5901522afca7 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv_transpose1d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv_transpose1d.cpp @@ -18,332 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv_transpose1d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution1D"; - } - - const char* name_str() const - { - return "conv_transpose1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - 
op->params["4"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kw to outch-inch-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kw = weight.shape[2]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kw); - float* w2 = (float*)new_weight.data(); - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < kw; k++) - { - w2[(i * inch + j) * kw + k] = w[(j * outch + i) * kw + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose1d, 20) - -class F_conv_transpose1d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution1D"; - } - - const char* name_str() const - { - return "conv_transpose1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kw to outch-inch-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kw = weight.shape[2]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kw); - float* w2 = (float*)new_weight.data(); - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < kw; k++) - { - w2[(i * inch + j) * kw + k] = w[(j * outch + i) * kw + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose1d_1, 20) - -class F_conv_transpose1d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return 
"DeconvolutionDepthWise1D"; - } - - const char* name_str() const - { - return "deconvdw1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kw to group-outch/group-inch/group-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kw = weight.shape[2]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * kw; - const float* wg = w + g * inch_g * outch_g * kw; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < kw; k++) - { - wg2[(i * inch_g + j) * kw + k] = wg[(j * outch_g + i) * kw + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose1d_2, 21) - -class F_conv_transpose1d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "DeconvolutionDepthWise1D"; - } - - const char* name_str() const - { - return "deconvdw1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kw to group-outch/group-inch/group-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kw = weight.shape[2]; - std::vector new_weight; - { - const float* w = (const 
float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * kw; - const float* wg = w + g * inch_g * outch_g * kw; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < kw; k++) - { - wg2[(i * inch_g + j) * kw + k] = wg[(j * outch_g + i) * kw + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose1d_3, 21) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_conv_transpose2d.cpp b/tools/pnnx/src/pass_ncnn/F_conv_transpose2d.cpp index fc9f9e75fac4..890f36cc92a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv_transpose2d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv_transpose2d.cpp @@ -18,360 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv_transpose2d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution"; - } - - const char* name_str() const - { - return "conv_transpose2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[1]; - op->params["19"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kh-kw to outch-inch-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kh = weight.shape[2]; - const int kw = weight.shape[3]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kh * kw); - float* w2 = (float*)new_weight.data(); - const int maxk = kh * kw; - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < maxk; k++) - { - w2[(i * inch + j) * maxk + k] = w[(j * outch + i) * maxk + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kh, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose2d, 20) - -class F_conv_transpose2d_1 : public GraphRewriterPass 
-{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution"; - } - - const char* name_str() const - { - return "conv_transpose2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[1]; - op->params["19"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kh-kw to outch-inch-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kh = weight.shape[2]; - const int kw = weight.shape[3]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kh * kw); - float* w2 = (float*)new_weight.data(); - const int maxk = kh * kw; - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < maxk; k++) - { - w2[(i * inch + j) * maxk + k] = w[(j * outch + i) * maxk + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kh, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose2d_1, 20) - -class F_conv_transpose2d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "DeconvolutionDepthWise"; - } - - const char* name_str() const - { - return "deconvdw2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = 
captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[1]; - op->params["19"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kh = weight.shape[2]; - const int kw = weight.shape[3]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kh * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - const int maxk = kh * kw; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * maxk; - const float* wg = w + g * inch_g * outch_g * maxk; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < maxk; k++) - { - wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kh, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose2d_2, 21) - -class F_conv_transpose2d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "DeconvolutionDepthWise"; - } - - const char* name_str() const - { - return "deconvdw2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[1]; - op->params["19"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kh = weight.shape[2]; - const int kw = weight.shape[3]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - 
new_weight.resize(outch / groups * inch * kh * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - const int maxk = kh * kw; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * maxk; - const float* wg = w + g * inch_g * outch_g * maxk; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < maxk; k++) - { - wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kh, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose2d_3, 21) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_conv_transpose3d.cpp b/tools/pnnx/src/pass_ncnn/F_conv_transpose3d.cpp index 80017555231f..890f36cc92a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv_transpose3d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv_transpose3d.cpp @@ -18,384 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv_transpose3d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution3D"; - } - - const char* name_str() const - { - return "conv_transpose3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[2]; - op->params["19"] = captured_params.at("output_padding").ai[1]; - op->params["20"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kd-kh-kw to outch-inch-kd-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kd = weight.shape[2]; - const int kh = weight.shape[3]; - const int kw = weight.shape[4]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kd * kh * kw); - float* w2 = (float*)new_weight.data(); - const int maxk = kd * kh * kw; - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < maxk; k++) - { - w2[(i * 
inch + j) * maxk + k] = w[(j * outch + i) * maxk + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kd, kh, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose3d, 20) - -class F_conv_transpose3d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution3D"; - } - - const char* name_str() const - { - return "conv_transpose3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[2]; - op->params["19"] = captured_params.at("output_padding").ai[1]; - op->params["20"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kd-kh-kw to outch-inch-kd-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kd = weight.shape[2]; - const int kh = weight.shape[3]; - const int kw = weight.shape[4]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kd * kh * kw); - float* w2 = (float*)new_weight.data(); - const int maxk = kd * kh * kw; - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < maxk; k++) - { - w2[(i * inch + j) * maxk + k] = w[(j * outch + i) * maxk + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kd, kh, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose3d_1, 20) - -class F_conv_transpose3d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return 
"DeconvolutionDepthWise3D"; - } - - const char* name_str() const - { - return "deconvdw3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[2]; - op->params["19"] = captured_params.at("output_padding").ai[1]; - op->params["20"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kd-kh-kw to group-outch/group-inch/group-kd-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kd = weight.shape[2]; - const int kh = weight.shape[3]; - const int kw = weight.shape[4]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kd * kh * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - const int maxk = kd * kh * kw; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * maxk; - const float* wg = w + g * inch_g * outch_g * maxk; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < maxk; k++) - { - wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kd, kh, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose3d_2, 21) - -class F_conv_transpose3d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "DeconvolutionDepthWise3D"; - } - - const char* name_str() const - { - return "deconvdw3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - const int groups = captured_params.at("groups").i; - - 
op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[2]; - op->params["19"] = captured_params.at("output_padding").ai[1]; - op->params["20"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kd-kh-kw to group-outch/group-inch/group-kd-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kd = weight.shape[2]; - const int kh = weight.shape[3]; - const int kw = weight.shape[4]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kd * kh * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - const int maxk = kd * kh * kw; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * maxk; - const float* wg = w + g * inch_g * outch_g * maxk; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < maxk; k++) - { - wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kd, kh, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose3d_3, 21) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_fold.cpp b/tools/pnnx/src/pass_ncnn/F_fold.cpp new file mode 100644 index 000000000000..1d35a72eb119 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_fold.cpp @@ -0,0 +1,63 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_fold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.fold op_0 1 1 input out output_size=%output_size kernel_size=%kernel_size dilation=%dilation stride=%stride padding=%padding +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Fold"; + } + + const char* name_str() const + { + return "fold"; + } + + void write(Operator* op, const std::map& captured_params) const + { + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + op->params["20"] = captured_params.at("output_size").ai[1]; + op->params["21"] = captured_params.at("output_size").ai[0]; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_fold, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_glu.cpp b/tools/pnnx/src/pass_ncnn/F_glu.cpp index cb9397dc15e7..3baf63ce52a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_glu.cpp +++ b/tools/pnnx/src/pass_ncnn/F_glu.cpp @@ -1,16 +1,17 @@ -// Copyright (c) 2022 Xiaomi Corp. (author: Fangjun Kuang) +// Tencent is pleased to support the open source community by making ncnn available. // -// Licensed under the BSD 3-Clause License (the "License"); you may not use this -// file except in compliance with the License. You may obtain a copy of the -// License at +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// 2022 Xiaomi Corp. (author: Fangjun Kuang) +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// License for the specific language governing permissions and limitations under -// the License. +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
#include "pass_ncnn.h" @@ -18,30 +19,55 @@ namespace pnnx { namespace ncnn { -class F_glu : public GraphRewriterPass { - public: - const char *match_pattern_graph() const { - return R"PNNXIR(7767517 +class F_glu : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input F.glu op_0 1 1 input out dim=%dim pnnx.Output output 1 0 out )PNNXIR"; - } + } + + const char* type_str() const + { + return "GLU"; + } + + const char* name_str() const + { + return "glu"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const int batch_index = op->inputs[0]->params["__batch_index"].i; + + int axis = captured_params.at("dim").i; + if (axis == batch_index) + { + fprintf(stderr, "glu along batch axis %d is not supported\n", batch_index); + return; + } - const char *type_str() const { return "GLU"; } + if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); + axis = input_rank + axis; + } - const char *name_str() const { return "glu"; } - void write(Operator *op, - const std::map &captured_params) const { - int axis = captured_params.at("dim").i; + if (axis > batch_index) + axis -= 1; - op->params["0"] = axis; - } + op->params["0"] = axis; + } }; REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_glu, 20) -} // namespace ncnn +} // namespace ncnn -} // namespace pnnx +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp new file mode 100644 index 000000000000..41dfc65ee39c --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -0,0 +1,71 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_grid_sample : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 input0 +pnnx.Input input_1 0 1 input1 +F.grid_sample op_0 2 1 input0 input1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "GridSample"; + } + + const char* name_str() const + { + return "gridsample"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::string& mode = captured_params.at("mode").s; + if (mode == "bilinear") + op->params["0"] = 1; + if (mode == "nearest") + op->params["0"] = 2; + if (mode == "bicubic") + op->params["0"] = 3; + + const std::string& padding_mode = captured_params.at("padding_mode").s; + if (padding_mode == "zeros") + op->params["1"] = 1; + if (padding_mode == "border") + op->params["1"] = 2; + if (padding_mode == "reflection") + op->params["1"] = 3; + + op->params["2"] = captured_params.at("align_corners").b ? 
1 : 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_group_norm.cpp b/tools/pnnx/src/pass_ncnn/F_group_norm.cpp index 7aecbf238558..0af5d32c5561 100644 --- a/tools/pnnx/src/pass_ncnn/F_group_norm.cpp +++ b/tools/pnnx/src/pass_ncnn/F_group_norm.cpp @@ -60,55 +60,6 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_group_norm, 20) -class F_group_norm_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.group_norm op_0 3 1 input weight bias out num_groups=%num_groups eps=%eps -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "GroupNorm"; - } - - const char* name_str() const - { - return "gn"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = captured_params.at("num_groups"); - op->params["1"] = weight.shape[0]; - op->params["2"] = captured_params.at("eps"); - op->params["3"] = 1; - - op->attrs["0"] = weight; - op->attrs["1"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_group_norm_1, 20) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_layer_norm.cpp b/tools/pnnx/src/pass_ncnn/F_layer_norm.cpp index 4ae1c5061c9c..74ec974fb3cf 100644 --- a/tools/pnnx/src/pass_ncnn/F_layer_norm.cpp +++ b/tools/pnnx/src/pass_ncnn/F_layer_norm.cpp @@ -58,61 +58,6 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_layer_norm, 20) -class F_layer_norm_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.layer_norm op_0 3 1 input weight bias out normalized_shape=%normalized_shape eps=%eps -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "LayerNorm"; - } - - const char* name_str() const - { - return "ln"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - const std::vector& normalized_shape = captured_params.at("normalized_shape").ai; - int affine_size = normalized_shape[0]; - for (size_t i = 1; i < normalized_shape.size(); i++) - { - affine_size *= normalized_shape[i]; - } - - op->params["0"] = affine_size; - op->params["1"] = captured_params.at("eps"); - op->params["2"] = 1; - - op->attrs["0"] = weight; - op->attrs["1"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_layer_norm_1, 20) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_linear.cpp b/tools/pnnx/src/pass_ncnn/F_linear.cpp index b76c444e4b63..890f36cc92a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_linear.cpp +++ b/tools/pnnx/src/pass_ncnn/F_linear.cpp @@ -18,101 +18,6 @@ namespace pnnx { namespace ncnn { -class 
F_linear : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.linear op_0 2 1 input weight out bias=None -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "InnerProduct"; - } - - const char* name_str() const - { - return "linear"; - } - - void write(Operator* op, const std::map& /*captured_params*/, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = 0; - op->params["2"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_linear, 20) - -class F_linear_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.linear op_0 3 1 input weight bias out -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "InnerProduct"; - } - - const char* name_str() const - { - return "linear"; - } - - void write(Operator* op, const std::map& /*captured_params*/, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = 1; - op->params["2"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_linear_1, 20) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_normalize.cpp b/tools/pnnx/src/pass_ncnn/F_normalize.cpp index db5e54ca47c2..2030ba5675e6 100644 --- a/tools/pnnx/src/pass_ncnn/F_normalize.cpp +++ b/tools/pnnx/src/pass_ncnn/F_normalize.cpp @@ -45,11 +45,6 @@ pnnx.Output output 1 0 out { const int batch_index = op->inputs[0]->params["__batch_index"].i; - int input_rank = op->inputs[0]->shape.size(); - - if (batch_index >= 0 && batch_index < input_rank) - input_rank -= 1; - int axis = captured_params.at("dim").i; if (axis == batch_index) { @@ -58,7 +53,10 @@ pnnx.Output output 1 0 out } if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); axis = input_rank + axis; + } if (axis > batch_index) axis -= 1; @@ -75,6 +73,11 @@ pnnx.Output output 1 0 out return; } + int input_rank = op->inputs[0]->shape.size(); + + if (batch_index >= 0 && batch_index < input_rank) + input_rank -= 1; + if (input_rank == 2 || axis != 0) { fprintf(stderr, "unsupported normalize for %d-rank tensor with axis %d\n", input_rank, axis); diff --git a/tools/pnnx/src/pass_ncnn/F_softmax.cpp b/tools/pnnx/src/pass_ncnn/F_softmax.cpp index 1ec110523ce6..a3a23587a86e 100644 --- a/tools/pnnx/src/pass_ncnn/F_softmax.cpp +++ b/tools/pnnx/src/pass_ncnn/F_softmax.cpp @@ -45,11 +45,6 @@ pnnx.Output output 1 0 out { const int batch_index = op->inputs[0]->params["__batch_index"].i; - int input_rank = op->inputs[0]->shape.size(); - - if (batch_index >= 0 && batch_index < input_rank) - 
input_rank -= 1; - int axis = captured_params.at("dim").i; if (axis == batch_index) { @@ -58,7 +53,10 @@ pnnx.Output output 1 0 out } if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); axis = input_rank + axis; + } if (axis > batch_index) axis -= 1; diff --git a/tools/pnnx/src/pass_ncnn/F_unfold.cpp b/tools/pnnx/src/pass_ncnn/F_unfold.cpp new file mode 100644 index 000000000000..14f82b08f998 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_unfold.cpp @@ -0,0 +1,61 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_unfold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.unfold op_0 1 1 input out kernel_size=%kernel_size dilation=%dilation stride=%stride padding=%padding +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Unfold"; + } + + const char* name_str() const + { + return "unfold"; + } + + void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const + { + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_unfold, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/Tensor_slice.cpp b/tools/pnnx/src/pass_ncnn/Tensor_slice.cpp index 0ca99525238c..ecc36506e860 100644 --- a/tools/pnnx/src/pass_ncnn/Tensor_slice.cpp +++ b/tools/pnnx/src/pass_ncnn/Tensor_slice.cpp @@ -14,6 +14,8 @@ #include "pass_ncnn.h" +#include <limits.h> + namespace pnnx { namespace ncnn { @@ -60,32 +62,37 @@ pnnx.Output output 1 0 out const int batch_index = op->inputs[0]->params["__batch_index"].i; - int input_rank = op->inputs[0]->shape.size(); + { + int input_rank = op->inputs[0]->shape.size(); - if (batch_index >= 0 && batch_index < input_rank) - input_rank -= 1; + if (batch_index >= 0 && batch_index < input_rank) + input_rank -= 1; - if (input_rank > 4) - { - fprintf(stderr, "slice %d-rank tensor with %d-rank axes is not possible!\n", input_rank, axes_rank); - return; + if (input_rank > 4) + { + fprintf(stderr, "slice %d-rank tensor with %d-rank axes is not possible!\n", input_rank, axes_rank); + return; + } } for (int i = 0; i < axes_rank; i++) { - if (axes[i] == batch_index && (starts[i] != 0 || ends[i] != -1)) + if (axes[i] == batch_index && (starts[i] != 0 || ends[i] != INT_MAX)) { fprintf(stderr, "slice along batch axis is not 
supported\n"); return; } if (axes[i] < 0) + { + int input_rank = op->inputs[0]->shape.size(); axes[i] = input_rank + axes[i]; + } if (axes[i] > batch_index) axes[i] -= 1; - if (ends[i] == -1) + if (ends[i] == INT_MAX) ends[i] = -233; } diff --git a/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.cpp b/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.cpp deleted file mode 100644 index 0d800bf8e617..000000000000 --- a/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.cpp +++ /dev/null @@ -1,121 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#include "convert_to_fp16_model.h" - -namespace pnnx { - -namespace ncnn { - -static unsigned short float32_to_float16(float value) -{ - // 1 : 8 : 23 - union - { - unsigned int u; - float f; - } tmp; - - tmp.f = value; - - // 1 : 8 : 23 - unsigned short sign = (tmp.u & 0x80000000) >> 31; - unsigned short exponent = (tmp.u & 0x7F800000) >> 23; - unsigned int significand = tmp.u & 0x7FFFFF; - - // NCNN_LOGE("%d %d %d", sign, exponent, significand); - - // 1 : 5 : 10 - unsigned short fp16; - if (exponent == 0) - { - // zero or denormal, always underflow - fp16 = (sign << 15) | (0x00 << 10) | 0x00; - } - else if (exponent == 0xFF) - { - // infinity or NaN - fp16 = (sign << 15) | (0x1F << 10) | (significand ? 
0x200 : 0x00); - } - else - { - // normalized - short newexp = exponent + (-127 + 15); - if (newexp >= 31) - { - // overflow, return infinity - fp16 = (sign << 15) | (0x1F << 10) | 0x00; - } - else if (newexp <= 0) - { - // Some normal fp32 cannot be expressed as normal fp16 - fp16 = (sign << 15) | (0x00 << 10) | 0x00; - } - else - { - // normal fp16 - fp16 = (sign << 15) | (newexp << 10) | (significand >> 13); - } - } - - return fp16; -} - -void convert_to_fp16_model(Graph& graph) -{ - for (Operator* op : graph.ops) - { - bool is_type_flag_fp32 = false; - for (auto& it : op->attrs) - { - Attribute& attr = it.second; - - if (is_type_flag_fp32) - { - // fp32 -> fp16 - const float* p = (const float*)attr.data.data(); - int len = attr.data.size() / 4; - std::vector data_fp16(len * 2); - unsigned short* p_fp16 = (unsigned short*)data_fp16.data(); - for (int i = 0; i < len; i++) - { - p_fp16[i] = float32_to_float16(p[i]); - } - - attr.type = 3; - attr.data = data_fp16; - - is_type_flag_fp32 = false; - continue; - } - - if (attr.type == 0 && attr.data == std::vector {0, 0, 0, 0}) - { - // write fp16 flag - // unsigned int fp16_flag = 0x01306B47; - attr.data[0] = 0x47; - attr.data[1] = 0x6B; - attr.data[2] = 0x30; - attr.data[3] = 0x01; - - is_type_flag_fp32 = true; - continue; - } - } - } -} - -} // namespace ncnn - -} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/expand_expression.cpp b/tools/pnnx/src/pass_ncnn/expand_expression.cpp index e24764349b87..baec8795c5dd 100644 --- a/tools/pnnx/src/pass_ncnn/expand_expression.cpp +++ b/tools/pnnx/src/pass_ncnn/expand_expression.cpp @@ -119,7 +119,23 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx // not supported return std::string(); } - else if (t == "sqrt" || t == "rsqrt" || t == "neg" || t == "floor" || t == "exp") + else if (t == "abs" + || t == "acos" + || t == "asin" + || t == "atan" + || t == "ceil" + || t == "cos" + || t == "exp" + || t == "floor" + || t == "log" + || t == "neg" + || t == "reciprocal" + || t == "rsqrt" + || t == "sin" + || t == "sqrt" + || t == "square" + || t == "tan" + || t == "tanh") { std::string a = exprstack.top(); exprstack.pop(); @@ -129,11 +145,23 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx Operator* op_unary = graph.new_operator_before("UnaryOp", t + "_" + std::to_string(pnnx_expr_index++), op); - if (t == "sqrt") op_unary->params["0"] = 5; - if (t == "rsqrt") op_unary->params["0"] = 6; - if (t == "neg") op_unary->params["0"] = 1; - if (t == "floor") op_unary->params["0"] = 2; + if (t == "abs") op_unary->params["0"] = 0; + if (t == "acos") op_unary->params["0"] = 13; + if (t == "asin") op_unary->params["0"] = 12; + if (t == "atan") op_unary->params["0"] = 14; + if (t == "ceil") op_unary->params["0"] = 3; + if (t == "cos") op_unary->params["0"] = 10; if (t == "exp") op_unary->params["0"] = 7; + if (t == "floor") op_unary->params["0"] = 2; + if (t == "log") op_unary->params["0"] = 8; + if (t == "neg") op_unary->params["0"] = 1; + if (t == "reciprocal") op_unary->params["0"] = 15; + if (t == "rsqrt") op_unary->params["0"] = 6; + if (t == "sin") op_unary->params["0"] = 9; + if (t == "sqrt") op_unary->params["0"] = 5; + if (t == "square") op_unary->params["0"] = 4; + if (t == "tan") op_unary->params["0"] = 11; + if (t == "tanh") op_unary->params["0"] = 16; Operand* op_unary_in = token_is_argument(a) ? 
op->inputs[std::stoi(a.substr(1))] : graph.get_operand(op->name + "_" + a); op_unary_in->consumers.push_back(op_unary); diff --git a/tools/pnnx/src/pass_ncnn/nn_Fold.cpp b/tools/pnnx/src/pass_ncnn/nn_Fold.cpp new file mode 100644 index 000000000000..d94bc68b0303 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_Fold.cpp @@ -0,0 +1,63 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_Fold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.Fold op_0 1 1 input out output_size=%output_size kernel_size=%kernel_size stride=%stride padding=%padding dilation=%dilation +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Fold"; + } + + const char* name_str() const + { + return "fold"; + } + + void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const + { + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + op->params["20"] = captured_params.at("output_size").ai[1]; + op->params["21"] = captured_params.at("output_size").ai[0]; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Fold, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_GLU.cpp b/tools/pnnx/src/pass_ncnn/nn_GLU.cpp new file mode 100644 index 000000000000..82e3f84942c6 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_GLU.cpp @@ -0,0 +1,73 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// 2022 Xiaomi Corp. (author: Fangjun Kuang) +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
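// nn_GLU below applies the same dim handling as the F_glu pass earlier in this
// diff: glu along the batch axis is rejected, a negative dim is resolved against
// the input rank, and the result is shifted down by one when it lies past the
// batch index. For a hypothetical NCHW input with __batch_index 0, dim=1 thus
// ends up as ncnn GLU param 0 = 0.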
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_GLU : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.GLU op_0 1 1 input out dim=%dim +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "GLU"; + } + + const char* name_str() const + { + return "glu"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const int batch_index = op->inputs[0]->params["__batch_index"].i; + + int axis = captured_params.at("dim").i; + if (axis == batch_index) + { + fprintf(stderr, "glu along batch axis %d is not supported\n", batch_index); + return; + } + + if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); + axis = input_rank + axis; + } + + if (axis > batch_index) + axis -= 1; + + op->params["0"] = axis; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_GLU, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_LSTM.cpp b/tools/pnnx/src/pass_ncnn/nn_LSTM.cpp index c8cfbe4e33ba..1a1511680934 100644 --- a/tools/pnnx/src/pass_ncnn/nn_LSTM.cpp +++ b/tools/pnnx/src/pass_ncnn/nn_LSTM.cpp @@ -27,7 +27,7 @@ class nn_LSTM : public GraphRewriterPass return R"PNNXIR(7767517 3 4 pnnx.Input input 0 1 input -nn.LSTM op_0 1 3 input out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse +nn.LSTM op_0 1 3 input out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_hr_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse @weight_hr_l0_reverse pnnx.Output output 3 0 out out_hidden out_cell )PNNXIR"; } @@ -46,23 +46,19 @@ pnnx.Output output 3 0 out out_hidden out_cell { const bool bidirectional = captured_params.at("bidirectional").b; const int num_directions = bidirectional ? 2 : 1; - const int num_output = captured_params.at("hidden_size").i; + const int hidden_size = captured_params.at("hidden_size").i; const int input_size = captured_params.at("input_size").i; - int proj_size = captured_params.at("proj_size").i; - if (captured_params.count("proj_size")) { - proj_size = captured_params.at("proj_size").i; - } - const int real_output_size = proj_size ? proj_size : num_output; + int proj_size = captured_params.at("proj_size").i; + if (proj_size == 0) + proj_size = hidden_size; - int weight_data_size = num_directions * num_output * input_size * 4; + int weight_data_size = num_directions * hidden_size * input_size * 4; - op->params["0"] = num_output; + op->params["0"] = proj_size; op->params["1"] = weight_data_size; op->params["2"] = bidirectional ? 
2 : 0; - if (proj_size) { - op->params["3"] = proj_size; - } + op->params["3"] = hidden_size; op->attrs["0"] = Attribute(); op->attrs["0"].data = {0, 0, 0, 0}; @@ -71,7 +67,7 @@ pnnx.Output output 3 0 out out_hidden out_cell { std::vector new_weight_ih; { - const int weight_data_size_g = num_output * input_size; + const int weight_data_size_g = hidden_size * input_size; const float* weight_ih = (const float*)captured_attrs.at("op_0.weight_ih_l0").data.data(); const float* iptr = weight_ih; @@ -79,7 +75,7 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* gptr = weight_ih + weight_data_size_g * 2; const float* optr = weight_ih + weight_data_size_g * 3; - new_weight_ih.resize(4 * num_output * input_size); + new_weight_ih.resize(4 * hidden_size * input_size); float* weight = (float*)new_weight_ih.data(); float* w_iptr = weight; float* w_fptr = weight + weight_data_size_g; @@ -95,7 +91,7 @@ pnnx.Output output 3 0 out out_hidden out_cell { std::vector new_weight_ih_reverse; { - const int weight_data_size_g = num_output * input_size; + const int weight_data_size_g = hidden_size * input_size; const float* weight_ih = (const float*)captured_attrs.at("op_0.weight_ih_l0_reverse").data.data(); const float* iptr = weight_ih; @@ -103,7 +99,7 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* gptr = weight_ih + weight_data_size_g * 2; const float* optr = weight_ih + weight_data_size_g * 3; - new_weight_ih_reverse.resize(4 * num_output * input_size); + new_weight_ih_reverse.resize(4 * hidden_size * input_size); float* weight = (float*)new_weight_ih_reverse.data(); float* w_iptr = weight; float* w_fptr = weight + weight_data_size_g; @@ -114,11 +110,11 @@ pnnx.Output output 3 0 out out_hidden out_cell memcpy(w_optr, optr, weight_data_size_g * sizeof(float)); memcpy(w_gptr, gptr, weight_data_size_g * sizeof(float)); } - op->attrs["1"] = Attribute({4, num_output, input_size}, new_weight_ih) + Attribute({4, num_output, input_size}, new_weight_ih_reverse); + op->attrs["1"] = Attribute({4, hidden_size, input_size}, new_weight_ih) + Attribute({4, hidden_size, input_size}, new_weight_ih_reverse); } else { - op->attrs["1"] = Attribute({4, num_output, input_size}, new_weight_ih); + op->attrs["1"] = Attribute({4, hidden_size, input_size}, new_weight_ih); } } @@ -133,33 +129,33 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* bias_ih = (const float*)captured_attrs.at("op_0.bias_ih_l0").data.data(); const float* bias_hh = (const float*)captured_attrs.at("op_0.bias_hh_l0").data.data(); const float* bias_ih_iptr = bias_ih; - const float* bias_ih_fptr = bias_ih + num_output; - const float* bias_ih_gptr = bias_ih + num_output * 2; - const float* bias_ih_optr = bias_ih + num_output * 3; + const float* bias_ih_fptr = bias_ih + hidden_size; + const float* bias_ih_gptr = bias_ih + hidden_size * 2; + const float* bias_ih_optr = bias_ih + hidden_size * 3; const float* bias_hh_iptr = bias_hh; - const float* bias_hh_fptr = bias_hh + num_output; - const float* bias_hh_gptr = bias_hh + num_output * 2; - const float* bias_hh_optr = bias_hh + num_output * 3; + const float* bias_hh_fptr = bias_hh + hidden_size; + const float* bias_hh_gptr = bias_hh + hidden_size * 2; + const float* bias_hh_optr = bias_hh + hidden_size * 3; - new_bias.resize(4 * num_output); + new_bias.resize(4 * hidden_size); float* bias = (float*)new_bias.data(); float* b_iptr = bias; - float* b_fptr = bias + num_output; - float* b_optr = bias + num_output * 2; - float* b_gptr = bias + num_output * 3; - for (int i = 0; i < 
num_output; i++) + float* b_fptr = bias + hidden_size; + float* b_optr = bias + hidden_size * 2; + float* b_gptr = bias + hidden_size * 3; + for (int i = 0; i < hidden_size; i++) { b_iptr[i] = bias_ih_iptr[i] + bias_hh_iptr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_fptr[i] = bias_ih_fptr[i] + bias_hh_fptr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_optr[i] = bias_ih_optr[i] + bias_hh_optr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_gptr[i] = bias_ih_gptr[i] + bias_hh_gptr[i]; } @@ -172,63 +168,63 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* bias_ih = (const float*)captured_attrs.at("op_0.bias_ih_l0_reverse").data.data(); const float* bias_hh = (const float*)captured_attrs.at("op_0.bias_hh_l0_reverse").data.data(); const float* bias_ih_iptr = bias_ih; - const float* bias_ih_fptr = bias_ih + num_output; - const float* bias_ih_gptr = bias_ih + num_output * 2; - const float* bias_ih_optr = bias_ih + num_output * 3; + const float* bias_ih_fptr = bias_ih + hidden_size; + const float* bias_ih_gptr = bias_ih + hidden_size * 2; + const float* bias_ih_optr = bias_ih + hidden_size * 3; const float* bias_hh_iptr = bias_hh; - const float* bias_hh_fptr = bias_hh + num_output; - const float* bias_hh_gptr = bias_hh + num_output * 2; - const float* bias_hh_optr = bias_hh + num_output * 3; + const float* bias_hh_fptr = bias_hh + hidden_size; + const float* bias_hh_gptr = bias_hh + hidden_size * 2; + const float* bias_hh_optr = bias_hh + hidden_size * 3; - new_bias_reverse.resize(4 * num_output); + new_bias_reverse.resize(4 * hidden_size); float* bias = (float*)new_bias_reverse.data(); float* b_iptr = bias; - float* b_fptr = bias + num_output; - float* b_optr = bias + num_output * 2; - float* b_gptr = bias + num_output * 3; - for (int i = 0; i < num_output; i++) + float* b_fptr = bias + hidden_size; + float* b_optr = bias + hidden_size * 2; + float* b_gptr = bias + hidden_size * 3; + for (int i = 0; i < hidden_size; i++) { b_iptr[i] = bias_ih_iptr[i] + bias_hh_iptr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_fptr[i] = bias_ih_fptr[i] + bias_hh_fptr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_optr[i] = bias_ih_optr[i] + bias_hh_optr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_gptr[i] = bias_ih_gptr[i] + bias_hh_gptr[i]; } } - op->attrs["3"] = Attribute({4, num_output}, new_bias) + Attribute({4, num_output}, new_bias_reverse); + op->attrs["3"] = Attribute({4, hidden_size}, new_bias) + Attribute({4, hidden_size}, new_bias_reverse); } else { - op->attrs["3"] = Attribute({4, num_output}, new_bias); + op->attrs["3"] = Attribute({4, hidden_size}, new_bias); } } else { - std::vector bias(4 * num_output, 0.f); + std::vector bias(4 * hidden_size, 0.f); if (bidirectional) - op->attrs["3"] = Attribute({4, num_output}, bias) + Attribute({4, num_output}, bias); + op->attrs["3"] = Attribute({4, hidden_size}, bias) + Attribute({4, hidden_size}, bias); else - op->attrs["3"] = Attribute({4, num_output}, bias); + op->attrs["3"] = Attribute({4, hidden_size}, bias); } op->attrs["4"] = Attribute(); op->attrs["4"].data = {0, 0, 0, 0}; - // reorder IFGO-hidden-hidden to IFOG-hidden-hidden + // reorder IFGO-hidden-proj to IFOG-hidden-proj { std::vector new_weight_hh; { - const int weight_data_size_g = num_output * real_output_size; + const int 
weight_data_size_g = hidden_size * proj_size; const float* weight_hh = (const float*)captured_attrs.at("op_0.weight_hh_l0").data.data(); const float* iptr = weight_hh; @@ -236,7 +232,7 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* gptr = weight_hh + weight_data_size_g * 2; const float* optr = weight_hh + weight_data_size_g * 3; - new_weight_hh.resize(4 * weight_data_size_g); + new_weight_hh.resize(4 * hidden_size * proj_size); float* weight = (float*)new_weight_hh.data(); float* w_iptr = weight; float* w_fptr = weight + weight_data_size_g; @@ -252,7 +248,7 @@ pnnx.Output output 3 0 out out_hidden out_cell { std::vector new_weight_hh_reverse; { - const int weight_data_size_g = num_output * real_output_size; + const int weight_data_size_g = hidden_size * proj_size; const float* weight_hh = (const float*)captured_attrs.at("op_0.weight_hh_l0_reverse").data.data(); const float* iptr = weight_hh; @@ -260,7 +256,7 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* gptr = weight_hh + weight_data_size_g * 2; const float* optr = weight_hh + weight_data_size_g * 3; - new_weight_hh_reverse.resize(4 * weight_data_size_g); + new_weight_hh_reverse.resize(4 * hidden_size * proj_size); float* weight = (float*)new_weight_hh_reverse.data(); float* w_iptr = weight; float* w_fptr = weight + weight_data_size_g; @@ -271,30 +267,28 @@ pnnx.Output output 3 0 out out_hidden out_cell memcpy(w_optr, optr, weight_data_size_g * sizeof(float)); memcpy(w_gptr, gptr, weight_data_size_g * sizeof(float)); } - op->attrs["5"] = Attribute({4, num_output, real_output_size}, new_weight_hh) + Attribute({4, num_output, real_output_size}, new_weight_hh_reverse); + op->attrs["5"] = Attribute({4, hidden_size, proj_size}, new_weight_hh) + Attribute({4, hidden_size, proj_size}, new_weight_hh_reverse); } else { - op->attrs["5"] = Attribute({4, num_output, real_output_size}, new_weight_hh); + op->attrs["5"] = Attribute({4, hidden_size, proj_size}, new_weight_hh); } } - if (proj_size) { - op->attrs["6"] = Attribute(); - op->attrs["6"].data = {0, 0, 0, 0}; - const float* weight_hr = (const float*)captured_attrs.at("op_0.weight_hr_l0").data.data(); - - const int weight_data_size_g = proj_size * num_output; - std::vector new_weight_hr(weight_hr, weight_hr + weight_data_size_g); - op->attrs["7"] = Attribute({proj_size, num_output}, new_weight_hr); - - if (bidirectional) { - fprintf(stderr, "Not implemented yet for bi-LSTM with proj_size > 0!\n"); - exit(-1); - } + if (proj_size != hidden_size) + { + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + if (bidirectional) + { + op->attrs["7"] = captured_attrs.at("op_0.weight_hr_l0") + captured_attrs.at("op_0.weight_hr_l0_reverse"); + } + else + { + op->attrs["7"] = captured_attrs.at("op_0.weight_hr_l0"); + } } - } }; @@ -310,7 +304,7 @@ class nn_LSTM_1 : public nn_LSTM pnnx.Input input 0 1 input pnnx.Input in_hidden 0 1 in_hidden pnnx.Input in_hidden 0 1 in_cell -nn.LSTM op_0 3 3 input in_hidden in_cell out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse +nn.LSTM op_0 3 3 input in_hidden in_cell out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 
@weight_hr_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse @weight_hr_l0_reverse pnnx.Output output 3 0 out out_hidden out_cell )PNNXIR"; } @@ -326,7 +320,7 @@ class nn_LSTM_2 : public nn_LSTM return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -nn.LSTM op_0 1 1 input out input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse +nn.LSTM op_0 1 1 input out input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_hr_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse @weight_hr_l0_reverse pnnx.Output output 1 0 out )PNNXIR"; } @@ -344,7 +338,7 @@ class nn_LSTM_3 : public nn_LSTM pnnx.Input input 0 1 input pnnx.Input in_hidden 0 1 in_hidden pnnx.Input in_hidden 0 1 in_cell -nn.LSTM op_0 3 1 input in_hidden in_cell out input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse +nn.LSTM op_0 3 1 input in_hidden in_cell out input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_hr_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse @weight_hr_l0_reverse pnnx.Output output 1 0 out )PNNXIR"; } @@ -352,34 +346,6 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_LSTM_3, 20) -class nn_LSTM_4 : public nn_LSTM -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Input in_hidden 0 1 in_hidden -pnnx.Input in_hidden 0 1 in_cell -nn.LSTM op_0 3 3 input in_hidden in_cell out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @weight_hr_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse -pnnx.Output output 3 0 out out_hidden out_cell -)PNNXIR"; - } - - const char* type_str() const - { - return "LSTM2"; - } - - const char* name_str() const - { - return "lstm2"; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_LSTM_4, 19) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_MultiheadAttention.cpp b/tools/pnnx/src/pass_ncnn/nn_MultiheadAttention.cpp index bcc9407f6053..610b304db6cb 100644 --- a/tools/pnnx/src/pass_ncnn/nn_MultiheadAttention.cpp +++ b/tools/pnnx/src/pass_ncnn/nn_MultiheadAttention.cpp @@ -29,7 +29,7 @@ class nn_MultiheadAttention : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -nn.MultiheadAttention op_0 1 1 input out num_heads=%num_heads batch_first=%batch_first add_zero_attn=%add_zero_attn embed_dim=%embed_dim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +nn.MultiheadAttention op_0 1 1 input out num_heads=%num_heads batch_first=%batch_first 
add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias pnnx.Output output 1 0 out )PNNXIR"; } @@ -55,6 +55,8 @@ pnnx.Output output 1 0 out } const int embed_dim = captured_params.at("embed_dim").i; + const int kdim = captured_params.at("kdim").i; + const int vdim = captured_params.at("vdim").i; // split in_proj_weight and in_proj_bias into q k v std::vector q_weight(embed_dim * embed_dim); @@ -90,6 +92,8 @@ pnnx.Output output 1 0 out } op->params["2"] = embed_dim * embed_dim; + op->params["3"] = kdim; + op->params["4"] = vdim; op->attrs["0"] = Attribute(); op->attrs["0"].data = {0, 0, 0, 0}; @@ -120,7 +124,7 @@ class nn_MultiheadAttention_1 : public nn_MultiheadAttention return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -nn.MultiheadAttention op_0 1 1 input out num_heads=%num_heads add_zero_attn=%add_zero_attn embed_dim=%embed_dim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +nn.MultiheadAttention op_0 1 1 input out num_heads=%num_heads add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias pnnx.Output output 1 0 out )PNNXIR"; } @@ -128,6 +132,187 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_1, 20) +class nn_MultiheadAttention_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +nn.MultiheadAttention op_0 3 1 query key value out num_heads=%num_heads batch_first=%batch_first add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @q_proj_weight @k_proj_weight @v_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "MultiHeadAttention"; + } + + const char* name_str() const + { + return "attention"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + op->params["0"] = captured_params.at("embed_dim"); + op->params["1"] = captured_params.at("num_heads"); + + if (captured_params.at("add_bias_kv").b) + { + fprintf(stderr, "MultiheadAttention add_bias_kv=True not supported\n"); + } + + const int embed_dim = captured_params.at("embed_dim").i; + const int kdim = captured_params.at("kdim").i; + const int vdim = captured_params.at("vdim").i; + + // split in_proj_bias into q k v + std::vector q_bias(embed_dim); + std::vector k_bias(embed_dim); + std::vector v_bias(embed_dim); + { + // qkv - embed_dim + const float* bptr = (const float*)captured_attrs.at("op_0.in_proj_bias").data.data(); + + { + memcpy(q_bias.data(), bptr, embed_dim * sizeof(float)); + bptr += embed_dim; + } + + { + memcpy(k_bias.data(), bptr, embed_dim * sizeof(float)); + bptr += embed_dim; + } + + { + memcpy(v_bias.data(), bptr, embed_dim * sizeof(float)); + } + } + + op->params["2"] = embed_dim * embed_dim; + op->params["3"] = kdim; + op->params["4"] = vdim; + + if (captured_attrs.find("op_0.in_proj_weight") != captured_attrs.end()) + { + // split in_proj_weight and in_proj_bias into q k v + std::vector q_weight(embed_dim * embed_dim); + std::vector k_weight(embed_dim * 
kdim); + std::vector v_weight(embed_dim * vdim); + { + // qkv - embed_dim - embed_dim + const float* wptr = (const float*)captured_attrs.at("op_0.in_proj_weight").data.data(); + + { + memcpy(q_weight.data(), wptr, embed_dim * embed_dim * sizeof(float)); + wptr += embed_dim * embed_dim; + } + + { + memcpy(k_weight.data(), wptr, embed_dim * kdim * sizeof(float)); + wptr += embed_dim * kdim; + } + + { + memcpy(v_weight.data(), wptr, embed_dim * vdim * sizeof(float)); + } + } + + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = Attribute({embed_dim, embed_dim}, q_weight); + op->attrs["2"] = Attribute({embed_dim}, q_bias); + op->attrs["3"] = Attribute(); + op->attrs["3"].data = {0, 0, 0, 0}; + op->attrs["4"] = Attribute({embed_dim, kdim}, k_weight); + op->attrs["5"] = Attribute({embed_dim}, k_bias); + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + op->attrs["7"] = Attribute({embed_dim, vdim}, v_weight); + op->attrs["8"] = Attribute({embed_dim}, v_bias); + } + else + { + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = captured_attrs.at("op_0.q_proj_weight"); + op->attrs["2"] = Attribute({embed_dim}, q_bias); + op->attrs["3"] = Attribute(); + op->attrs["3"].data = {0, 0, 0, 0}; + op->attrs["4"] = captured_attrs.at("op_0.k_proj_weight"); + op->attrs["5"] = Attribute({embed_dim}, k_bias); + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + op->attrs["7"] = captured_attrs.at("op_0.v_proj_weight"); + op->attrs["8"] = Attribute({embed_dim}, v_bias); + } + + op->attrs["9"] = Attribute(); + op->attrs["9"].data = {0, 0, 0, 0}; + op->attrs["a"] = captured_attrs.at("op_0.out_proj.weight"); + op->attrs["b"] = captured_attrs.at("op_0.out_proj.bias"); + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_2, 20) + +class nn_MultiheadAttention_3 : public nn_MultiheadAttention_2 +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +nn.MultiheadAttention op_0 3 1 query key value out num_heads=%num_heads add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @q_proj_weight @k_proj_weight @v_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_3, 20) + +class nn_MultiheadAttention_4 : public nn_MultiheadAttention_2 +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +nn.MultiheadAttention op_0 2 1 query key out num_heads=%num_heads batch_first=%batch_first add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @q_proj_weight @k_proj_weight @v_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_4, 20) + +class nn_MultiheadAttention_5 : public nn_MultiheadAttention_2 +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +nn.MultiheadAttention op_0 2 1 query key out num_heads=%num_heads add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim 
bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @q_proj_weight @k_proj_weight @v_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_5, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_Softmax.cpp b/tools/pnnx/src/pass_ncnn/nn_Softmax.cpp index f263e357207d..d9629e6eff8e 100644 --- a/tools/pnnx/src/pass_ncnn/nn_Softmax.cpp +++ b/tools/pnnx/src/pass_ncnn/nn_Softmax.cpp @@ -45,11 +45,6 @@ pnnx.Output output 1 0 out { const int batch_index = op->inputs[0]->params["__batch_index"].i; - int input_rank = op->inputs[0]->shape.size(); - - if (batch_index >= 0 && batch_index < input_rank) - input_rank -= 1; - int axis = captured_params.at("dim").i; if (axis == batch_index) { @@ -58,7 +53,10 @@ pnnx.Output output 1 0 out } if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); axis = input_rank + axis; + } if (axis > batch_index) axis -= 1; diff --git a/tools/pnnx/src/pass_ncnn/nn_Softmax2d.cpp b/tools/pnnx/src/pass_ncnn/nn_Softmax2d.cpp new file mode 100644 index 000000000000..152eb3a6a37f --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_Softmax2d.cpp @@ -0,0 +1,55 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_Softmax2d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.Softmax2d op_0 1 1 input out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Softmax"; + } + + const char* name_str() const + { + return "softmax2d"; + } + + void write(Operator* op, const std::map& /*captured_params*/) const + { + op->params["0"] = 0; + op->params["1"] = 1; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Softmax2d, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_Unfold.cpp b/tools/pnnx/src/pass_ncnn/nn_Unfold.cpp new file mode 100644 index 000000000000..526e5d24c38c --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_Unfold.cpp @@ -0,0 +1,61 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_Unfold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.Unfold op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding dilation=%dilation +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Unfold"; + } + + const char* name_str() const + { + return "unfold"; + } + + void write(Operator* op, const std::map& captured_params) const + { + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Unfold, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 13049e5f05b0..73e8e08eb39d 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -40,6 +40,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.conv1d", "F.conv2d", "F.conv3d", + "F.fold", "F.grid_sample", "F.group_norm", "F.instance_norm", @@ -54,6 +55,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.pixel_shuffle", "F.pixel_unshuffle", "F.prelu", + "F.unfold", "F.upsample_bilinear", "F.upsample_nearest", "F.upsample", @@ -80,6 +82,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.ConvTranspose1d", "nn.ConvTranspose2d", "nn.ConvTranspose3d", + "nn.Fold", "nn.GroupNorm", "nn.InstanceNorm1d", "nn.InstanceNorm2d", @@ -98,6 +101,8 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.ReplicationPad1d", "nn.ReplicationPad2d", "nn.ReplicationPad3d", + "nn.Softmax2d", + "nn.Unfold", "nn.Upsample", "nn.UpsamplingBilinear2d", "nn.UpsamplingNearest2d", @@ -283,6 +288,11 @@ void solve_batch_index(Graph& graph) { if (is_known_operator_with_batch_index_0(op)) { + if (op->type == std::string("F.grid_sample")) + { + op->inputs[1]->params["__batch_index"] = 0; + } + op->inputs[0]->params["__batch_index"] = 0; op->outputs[0]->params["__batch_index"] = 0; } diff --git a/tools/pnnx/src/pass_ncnn/torch_squeeze.cpp b/tools/pnnx/src/pass_ncnn/torch_squeeze.cpp index 1b475bbd7555..a1a52d272a70 100644 --- a/tools/pnnx/src/pass_ncnn/torch_squeeze.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_squeeze.cpp @@ -54,7 +54,7 @@ pnnx.Output output 1 0 out int input_rank = op->inputs[0]->shape.size(); - if (input_rank > 4) + if (input_rank > 5) { fprintf(stderr, "squeeze %d-rank tensor is not supported yet!\n", input_rank); return; @@ -97,6 +97,7 @@ pnnx.Output output 1 0 out { op->params["0"] = 1; op->params["1"] = 1; + op->params["11"] = 1; op->params["2"] = 1; } }; diff --git a/tools/pnnx/src/pass_ncnn/torch_unsqueeze.cpp b/tools/pnnx/src/pass_ncnn/torch_unsqueeze.cpp index 3dc2084d8f15..3c8dc24d18d7 100644 --- a/tools/pnnx/src/pass_ncnn/torch_unsqueeze.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_unsqueeze.cpp @@ -54,7 
+54,7 @@ pnnx.Output output 1 0 out int input_rank = op->inputs[0]->shape.size(); - if (input_rank > 3) + if (input_rank > 4) { fprintf(stderr, "unsqueeze %d-rank tensor is not supported yet!\n", input_rank); return; diff --git a/tools/pnnx/src/save_ncnn.cpp b/tools/pnnx/src/save_ncnn.cpp new file mode 100644 index 000000000000..6a4407879df0 --- /dev/null +++ b/tools/pnnx/src/save_ncnn.cpp @@ -0,0 +1,456 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "save_ncnn.h" + +namespace pnnx { + +static bool type_is_integer(int type) +{ + if (type == 1) return false; + if (type == 2) return false; + if (type == 3) return false; + if (type == 4) return true; + if (type == 5) return true; + if (type == 6) return true; + if (type == 7) return true; + if (type == 8) return true; + if (type == 9) return true; + if (type == 10) return false; + if (type == 11) return false; + if (type == 12) return false; + return false; +} + +static const char* type_to_dtype_string(int type) +{ + if (type == 1) return "torch.float"; + if (type == 2) return "torch.double"; + if (type == 3) return "torch.half"; + if (type == 4) return "torch.int"; + if (type == 5) return "torch.long"; + if (type == 6) return "torch.short"; + if (type == 7) return "torch.int8"; + if (type == 8) return "torch.uint8"; + if (type == 9) return "torch.bool"; + if (type == 10) return "torch.complex64"; + if (type == 11) return "torch.complex128"; + if (type == 12) return "torch.complex32"; + return "null"; +} + +static bool string_is_positive_integer(const std::string& t) +{ + for (size_t i = 0; i < t.size(); i++) + { + if (t[i] < '0' || t[i] > '9') + return false; + } + + return true; +} + +static unsigned short float32_to_float16(float value) +{ + // 1 : 8 : 23 + union + { + unsigned int u; + float f; + } tmp; + + tmp.f = value; + + // 1 : 8 : 23 + unsigned short sign = (tmp.u & 0x80000000) >> 31; + unsigned short exponent = (tmp.u & 0x7F800000) >> 23; + unsigned int significand = tmp.u & 0x7FFFFF; + + // NCNN_LOGE("%d %d %d", sign, exponent, significand); + + // 1 : 5 : 10 + unsigned short fp16; + if (exponent == 0) + { + // zero or denormal, always underflow + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else if (exponent == 0xFF) + { + // infinity or NaN + fp16 = (sign << 15) | (0x1F << 10) | (significand ? 
0x200 : 0x00); + } + else + { + // normalized + short newexp = exponent + (-127 + 15); + if (newexp >= 31) + { + // overflow, return infinity + fp16 = (sign << 15) | (0x1F << 10) | 0x00; + } + else if (newexp <= 0) + { + // Some normal fp32 cannot be expressed as normal fp16 + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else + { + // normal fp16 + fp16 = (sign << 15) | (newexp << 10) | (significand >> 13); + } + } + + return fp16; +} + +static size_t alignSize(size_t sz, int n) +{ + return (sz + n - 1) & -n; +} + +int save_ncnn(const Graph& g, const std::string& parampath, const std::string& binpath, const std::string& pypath, int fp16) +{ + FILE* paramfp = fopen(parampath.c_str(), "wb"); + if (!paramfp) + { + fprintf(stderr, "fopen %s failed\n", parampath.c_str()); + return -1; + } + + FILE* binfp = fopen(binpath.c_str(), "wb"); + if (!binfp) + { + fprintf(stderr, "fopen %s failed\n", binpath.c_str()); + fclose(paramfp); + return -1; + } + + // magic + fprintf(paramfp, "7767517\n"); + + // op count and oprand count + fprintf(paramfp, "%d %d\n", (int)g.ops.size(), (int)g.operands.size()); + + for (const Operator* op : g.ops) + { + fprintf(paramfp, "%-24s %-24s %d %d", op->type.c_str(), op->name.c_str(), (int)op->inputs.size(), (int)op->outputs.size()); + + for (const Operand* oprand : op->inputs) + { + fprintf(paramfp, " %s", oprand->name.c_str()); + } + + for (const Operand* oprand : op->outputs) + { + fprintf(paramfp, " %s", oprand->name.c_str()); + } + + for (const auto& it : op->params) + { + const Parameter& param = it.second; + + if (!string_is_positive_integer(it.first)) + { + fprintf(stderr, "ignore %s %s param %s=", op->type.c_str(), op->name.c_str(), it.first.c_str()); + + if (param.type == 0) + { + fprintf(stderr, "None"); + } + if (param.type == 1) + { + if (param.b) + fprintf(stderr, "True"); + else + fprintf(stderr, "False"); + } + if (param.type == 2) + { + fprintf(stderr, "%d", param.i); + } + if (param.type == 3) + { + fprintf(stderr, "%e", param.f); + } + if (param.type == 4) + { + fprintf(stderr, "%s", param.s.c_str()); + } + if (param.type == 5) + { + fprintf(stderr, "("); + for (size_t i = 0; i < param.ai.size(); i++) + { + fprintf(stderr, "%d", param.ai[i]); + if (i + 1 != param.ai.size()) + fprintf(stderr, ","); + } + fprintf(stderr, ")"); + } + if (param.type == 6) + { + fprintf(stderr, "("); + for (size_t i = 0; i < param.af.size(); i++) + { + fprintf(stderr, "%e", param.af[i]); + if (i + 1 != param.af.size()) + fprintf(stderr, ","); + } + fprintf(stderr, ")"); + } + if (param.type == 7) + { + fprintf(stderr, "("); + for (size_t i = 0; i < param.as.size(); i++) + { + fprintf(stderr, "%s", param.as[i].c_str()); + if (i + 1 != param.as.size()) + fprintf(stderr, ","); + } + fprintf(stderr, ")"); + } + fprintf(stderr, "\n"); + + continue; + } + + const int idkey = std::stoi(it.first); + if (param.type == 2) + { + fprintf(paramfp, " %d=%d", idkey, param.i); + } + if (param.type == 3) + { + fprintf(paramfp, " %d=%e", idkey, param.f); + } + if (param.type == 5) + { + const int array_size = (int)param.ai.size(); + fprintf(paramfp, " %d=%d", -23300 - idkey, array_size); + for (size_t i = 0; i < param.ai.size(); i++) + { + fprintf(paramfp, ",%d", param.ai[i]); + } + } + if (param.type == 6) + { + const int array_size = (int)param.af.size(); + fprintf(paramfp, " %d=%d", -23300 - idkey, array_size); + for (size_t i = 0; i < param.af.size(); i++) + { + fprintf(paramfp, ",%e", param.af[i]); + } + } + } + + bool is_type_flag_fp32 = false; + for (const auto& it : op->attrs) + 
{ + // fprintf(paramfp, " @%s=", it.first.c_str()); + + const Attribute& attr = it.second; + + if (fp16 && is_type_flag_fp32) + { + // fp32 -> fp16 + const float* p = (const float*)attr.data.data(); + int len = attr.data.size() / 4; + std::vector data_fp16(alignSize(len * 2, 4)); + unsigned short* p_fp16 = (unsigned short*)data_fp16.data(); + for (int i = 0; i < len; i++) + { + p_fp16[i] = float32_to_float16(p[i]); + } + + // pad size to 4bytes + if (len % 2 == 1) + { + // pad with fixed value for model hash consistency + p_fp16[len] = 0x2283; + } + + fwrite(data_fp16.data(), data_fp16.size(), 1, binfp); + + is_type_flag_fp32 = false; + continue; + } + + if (fp16 && attr.type == 0 && attr.data == std::vector {0, 0, 0, 0}) + { + // write fp16 flag + unsigned int fp16_flag = 0x01306B47; + fwrite((const char*)&fp16_flag, sizeof(fp16_flag), 1, binfp); + + is_type_flag_fp32 = true; + continue; + } + + fwrite(attr.data.data(), attr.data.size(), 1, binfp); + } + + // if (op->inputnames.size() == op->inputs.size()) + // { + // for (size_t i = 0; i < op->inputs.size(); i++) + // { + // const Operand* oprand = op->inputs[i]; + // fprintf(paramfp, " $%s=%s", op->inputnames[i].c_str(), oprand->name.c_str()); + // } + // } + + // for (const Operand* oprand : op->outputs) + // { + // if (oprand->params.find("__batch_index") == oprand->params.end()) + // continue; + // + // const int batch_index = oprand->params.at("__batch_index").i; + // + // fprintf(paramfp, " #%s=%d", oprand->name.c_str(), batch_index); + // } + + // for (const Operand* oprand : op->outputs) + // { + // if (oprand->shape.empty()) + // continue; + // + // fprintf(paramfp, " #%s=", oprand->name.c_str()); + // + // fprintf(paramfp, "("); + // for (int64_t i = 0; i < oprand->shape.size() - 1; i++) + // { + // fprintf(paramfp, "%d,", oprand->shape[i]); + // } + // if (oprand->shape.size() > 0) + // fprintf(paramfp, "%d", oprand->shape[oprand->shape.size() - 1]); + // fprintf(paramfp, ")"); + // + // fprintf(paramfp, type_to_string(oprand->type)); + // } + + fprintf(paramfp, "\n"); + } + + fclose(paramfp); + fclose(binfp); + + FILE* pyfp = fopen(pypath.c_str(), "wb"); + if (!pyfp) + { + fprintf(stderr, "fopen %s failed\n", pypath.c_str()); + return -1; + } + + fprintf(pyfp, "import numpy as np\n"); + fprintf(pyfp, "import ncnn\n"); + fprintf(pyfp, "import torch\n"); + + fprintf(pyfp, "\n"); + + // test inference + { + fprintf(pyfp, "def test_inference():\n"); + fprintf(pyfp, " torch.manual_seed(0)\n"); + + for (int input_index = 0;; input_index++) + { + std::string input_name = std::string("in") + std::to_string(input_index); + const Operand* r = g.get_operand(input_name); + if (!r) + break; + + if (type_is_integer(r->type)) + { + fprintf(pyfp, " %s = torch.randint(10, (", input_name.c_str()); + for (size_t i = 0; i < r->shape.size(); i++) + { + fprintf(pyfp, "%d", r->shape[i]); + if (i + 1 != r->shape.size() || r->shape.size() == 1) + fprintf(pyfp, ", "); + } + fprintf(pyfp, "), dtype=%s)\n", type_to_dtype_string(r->type)); + } + else + { + fprintf(pyfp, " %s = torch.rand(", input_name.c_str()); + for (size_t i = 0; i < r->shape.size(); i++) + { + fprintf(pyfp, "%d, ", r->shape[i]); + } + fprintf(pyfp, "dtype=%s)\n", type_to_dtype_string(r->type)); + } + } + + fprintf(pyfp, " out = []\n"); + fprintf(pyfp, "\n"); + + fprintf(pyfp, " with ncnn.Net() as net:\n"); + fprintf(pyfp, " net.load_param(\"%s\")\n", parampath.c_str()); + fprintf(pyfp, " net.load_model(\"%s\")\n", binpath.c_str()); + fprintf(pyfp, "\n"); + fprintf(pyfp, " with 
net.create_extractor() as ex:\n"); + + for (int input_index = 0;; input_index++) + { + std::string input_name = std::string("in") + std::to_string(input_index); + const Operand* r = g.get_operand(input_name); + if (!r) + break; + + const int batch_index = r->params.at("__batch_index").i; + if (batch_index != 233) + { + fprintf(pyfp, " ex.input(\"%s\", ncnn.Mat(%s.squeeze(%d).numpy()).clone())\n", input_name.c_str(), input_name.c_str(), batch_index); + } + else + { + fprintf(pyfp, " ex.input(\"%s\", ncnn.Mat(%s.numpy()).clone())\n", input_name.c_str(), input_name.c_str()); + } + } + + fprintf(pyfp, "\n"); + + for (int output_index = 0;; output_index++) + { + std::string output_name = std::string("out") + std::to_string(output_index); + const Operand* r = g.get_operand(output_name); + if (!r) + break; + + fprintf(pyfp, " _, %s = ex.extract(\"%s\")\n", output_name.c_str(), output_name.c_str()); + + const int batch_index = r->params.at("__batch_index").i; + if (batch_index != 233) + { + fprintf(pyfp, " out.append(torch.from_numpy(np.array(%s)).unsqueeze(%d))\n", output_name.c_str(), batch_index); + } + else + { + fprintf(pyfp, " out.append(torch.from_numpy(np.array(%s)))\n", output_name.c_str()); + } + } + + fprintf(pyfp, "\n"); + + fprintf(pyfp, " if len(out) == 1:\n"); + fprintf(pyfp, " return out[0]\n"); + fprintf(pyfp, " else:\n"); + fprintf(pyfp, " return tuple(out)\n"); + } + + fclose(pyfp); + + return 0; +} + +} // namespace pnnx diff --git a/tools/pnnx/src/save_ncnn.h b/tools/pnnx/src/save_ncnn.h new file mode 100644 index 000000000000..458c14700209 --- /dev/null +++ b/tools/pnnx/src/save_ncnn.h @@ -0,0 +1,26 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef PNNX_SAVE_NCNN_H +#define PNNX_SAVE_NCNN_H + +#include "ir.h" + +namespace pnnx { + +int save_ncnn(const Graph& g, const std::string& parampath, const std::string& binpath, const std::string& pypath, int fp16); + +} // namespace pnnx + +#endif // PNNX_SAVE_NCNN_H diff --git a/tools/pnnx/src/save_onnx.cpp b/tools/pnnx/src/save_onnx.cpp new file mode 100644 index 000000000000..55bb10cf7222 --- /dev/null +++ b/tools/pnnx/src/save_onnx.cpp @@ -0,0 +1,333 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "save_onnx.h" + +#include "onnx.pb.h" + +#include +#include +#include + +namespace pnnx { + +// from cxxabi bridge +extern const char* get_operand_name(const Operand* x); +extern const char* get_operator_type(const Operator* op); +extern const char* get_operator_name(const Operator* op); +extern std::vector get_operator_params_keys(const Operator* op); +extern std::vector get_operator_attrs_keys(const Operator* op); +extern const Parameter& get_operator_param(const Operator* op, const char* key); +extern const Attribute& get_operator_attr(const Operator* op, const char* key); +extern const char* get_param_s(const Parameter& p); +extern std::vector get_param_as(const Parameter& p); + +static unsigned short float32_to_float16(float value) +{ + // 1 : 8 : 23 + union + { + unsigned int u; + float f; + } tmp; + + tmp.f = value; + + // 1 : 8 : 23 + unsigned short sign = (tmp.u & 0x80000000) >> 31; + unsigned short exponent = (tmp.u & 0x7F800000) >> 23; + unsigned int significand = tmp.u & 0x7FFFFF; + + // NCNN_LOGE("%d %d %d", sign, exponent, significand); + + // 1 : 5 : 10 + unsigned short fp16; + if (exponent == 0) + { + // zero or denormal, always underflow + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else if (exponent == 0xFF) + { + // infinity or NaN + fp16 = (sign << 15) | (0x1F << 10) | (significand ? 0x200 : 0x00); + } + else + { + // normalized + short newexp = exponent + (-127 + 15); + if (newexp >= 31) + { + // overflow, return infinity + fp16 = (sign << 15) | (0x1F << 10) | 0x00; + } + else if (newexp <= 0) + { + // Some normal fp32 cannot be expressed as normal fp16 + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else + { + // normal fp16 + fp16 = (sign << 15) | (newexp << 10) | (significand >> 13); + } + } + + return fp16; +} + +int save_onnx(const Graph& g, const char* onnxpath, int fp16) +{ + onnx::ModelProto model; + + onnx::GraphProto* gp = model.mutable_graph(); + + for (const Operand* x : g.operands) + { + onnx::ValueInfoProto* vip = gp->add_value_info(); + + vip->set_name(get_operand_name(x)); + + onnx::TypeProto* tp = vip->mutable_type(); + + onnx::TypeProto_Tensor* tpt = tp->mutable_tensor_type(); + + switch (x->type) + { + case 1: // f32 + tpt->set_elem_type(fp16 ? 10 : 1); + break; + case 2: // f64 + tpt->set_elem_type(fp16 ? 
10 : 11); + break; + case 3: // f16 + tpt->set_elem_type(10); + break; + case 4: // i32 + tpt->set_elem_type(6); + break; + case 5: // i64 + tpt->set_elem_type(7); + break; + case 6: // i16 + tpt->set_elem_type(5); + break; + case 7: // i8 + tpt->set_elem_type(3); + break; + case 8: // u8 + tpt->set_elem_type(2); + break; + case 9: // bool + tpt->set_elem_type(9); + break; + case 10: // cp64 + tpt->set_elem_type(14); + break; + case 11: // cp128 + tpt->set_elem_type(15); + break; + case 12: // cp32 + tpt->set_elem_type(0); + break; + default: // null + tpt->set_elem_type(0); + break; + } + + onnx::TensorShapeProto* tsp = tpt->mutable_shape(); + + for (auto s : x->shape) + { + onnx::TensorShapeProto_Dimension* tspd = tsp->add_dim(); + + tspd->set_dim_value(s); + } + } + + for (const Operator* op : g.ops) + { + onnx::NodeProto* np = gp->add_node(); + + np->set_op_type(get_operator_type(op)); + np->set_name(get_operator_name(op)); + + for (const Operand* oprand : op->inputs) + { + np->add_input(get_operand_name(oprand)); + } + + for (const Operand* oprand : op->outputs) + { + np->add_output(get_operand_name(oprand)); + } + + std::vector params_keys = get_operator_params_keys(op); + for (const char* param_name : params_keys) + { + const Parameter& param = get_operator_param(op, param_name); + + onnx::AttributeProto* ap = np->add_attribute(); + + ap->set_name(param_name); + + if (param.type == 0) + { + ap->set_s("None"); + } + if (param.type == 1) + { + if (param.b) + ap->set_i(1); + else + ap->set_i(0); + } + if (param.type == 2) + { + ap->set_i(param.i); + } + if (param.type == 3) + { + ap->set_f(param.f); + } + if (param.type == 4) + { + ap->set_s(get_param_s(param)); + } + if (param.type == 5) + { + for (auto i : param.ai) + { + ap->add_ints(i); + } + } + if (param.type == 6) + { + for (auto f : param.af) + { + ap->add_floats(f); + } + } + if (param.type == 7) + { + std::vector as = get_param_as(param); + for (auto s : as) + { + ap->add_strings(s); + } + } + } + + std::vector attrs_keys = get_operator_attrs_keys(op); + for (const char* attr_name : attrs_keys) + { + onnx::TensorProto* tp = gp->add_initializer(); + + tp->set_name(std::string(get_operator_name(op)) + "." + attr_name); + + np->add_input(std::string(get_operator_name(op)) + "." + attr_name); + + const Attribute& attr = get_operator_attr(op, attr_name); + for (auto s : attr.shape) + { + tp->add_dims(s); + } + + switch (attr.type) + { + case 1: // f32 + tp->set_data_type(fp16 ? 10 : 1); + break; + case 2: // f64 + tp->set_data_type(fp16 ? 
10 : 11); + break; + case 3: // f16 + tp->set_data_type(10); + break; + case 4: // i32 + tp->set_data_type(6); + break; + case 5: // i64 + tp->set_data_type(7); + break; + case 6: // i16 + tp->set_data_type(5); + break; + case 7: // i8 + tp->set_data_type(3); + break; + case 8: // u8 + tp->set_data_type(2); + break; + case 9: // bool + tp->set_data_type(9); + break; + case 10: // cp64 + tp->set_data_type(14); + break; + case 11: // cp128 + tp->set_data_type(15); + break; + case 12: // cp32 + tp->set_data_type(0); + break; + default: // null + tp->set_data_type(0); + break; + } + + std::string* d = tp->mutable_raw_data(); + if (fp16 && attr.type == 1) + { + // fp32 to fp16 + const float* p = (const float*)attr.data.data(); + int len = attr.data.size() / 4; + d->resize(len * 2); + unsigned short* p_fp16 = (unsigned short*)d->data(); + for (int i = 0; i < len; i++) + { + p_fp16[i] = float32_to_float16(p[i]); + } + } + else if (fp16 && attr.type == 2) + { + // fp64 to fp16 + const double* p = (const double*)attr.data.data(); + int len = attr.data.size() / 4; + d->resize(len); + unsigned short* p_fp16 = (unsigned short*)d->data(); + for (int i = 0; i < len; i++) + { + p_fp16[i] = float32_to_float16((float)p[i]); + } + } + else + { + d->resize(attr.data.size()); + memcpy((void*)d->data(), attr.data.data(), attr.data.size()); + } + } + } + + std::fstream output(onnxpath, std::ios::out | std::ios::trunc | std::ios::binary); + if (!model.SerializeToOstream(&output)) + { + fprintf(stderr, "write onnx failed\n"); + return -1; + } + + return 0; +} + +} // namespace pnnx diff --git a/tools/pnnx/src/save_onnx.h b/tools/pnnx/src/save_onnx.h new file mode 100644 index 000000000000..9a4099872a6b --- /dev/null +++ b/tools/pnnx/src/save_onnx.h @@ -0,0 +1,26 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef PNNX_SAVE_ONNX_H +#define PNNX_SAVE_ONNX_H + +#include "ir.h" + +namespace pnnx { + +int save_onnx(const Graph& g, const char* onnxpath, int fp16); + +} // namespace pnnx + +#endif // PNNX_SAVE_ONNX_H diff --git a/tools/pnnx/src/save_onnx_cxxabi_bridge.cpp b/tools/pnnx/src/save_onnx_cxxabi_bridge.cpp new file mode 100644 index 000000000000..b74f2ab7a724 --- /dev/null +++ b/tools/pnnx/src/save_onnx_cxxabi_bridge.cpp @@ -0,0 +1,81 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +const char* get_operand_name(const Operand* x) +{ + return x->name.c_str(); +} + +const char* get_operator_type(const Operator* op) +{ + return op->type.c_str(); +} + +const char* get_operator_name(const Operator* op) +{ + return op->name.c_str(); +} + +std::vector get_operator_params_keys(const Operator* op) +{ + std::vector keys; + for (const auto& it : op->params) + { + const std::string& key = it.first; + keys.push_back(key.c_str()); + } + return keys; +} + +std::vector get_operator_attrs_keys(const Operator* op) +{ + std::vector keys; + for (const auto& it : op->attrs) + { + const std::string& key = it.first; + keys.push_back(key.c_str()); + } + return keys; +} + +const Parameter& get_operator_param(const Operator* op, const char* key) +{ + return op->params.at(key); +} + +const Attribute& get_operator_attr(const Operator* op, const char* key) +{ + return op->attrs.at(key); +} + +const char* get_param_s(const Parameter& p) +{ + return p.s.c_str(); +} + +std::vector get_param_as(const Parameter& p) +{ + std::vector as; + for (const auto& s : p.as) + { + as.push_back(s.c_str()); + } + return as; +} + +} // namespace pnnx diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index 50a2d327eeb1..8a69446d3609 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -30,6 +30,7 @@ pnnx_add_test(F_dropout3d) pnnx_add_test(F_elu) pnnx_add_test(F_embedding) pnnx_add_test(F_feature_alpha_dropout) +pnnx_add_test(F_fold) pnnx_add_test(F_gelu) pnnx_add_test(F_glu) pnnx_add_test(F_grid_sample) @@ -70,6 +71,7 @@ pnnx_add_test(F_softsign) pnnx_add_test(F_tanh) pnnx_add_test(F_tanhshrink) pnnx_add_test(F_threshold) +pnnx_add_test(F_unfold) pnnx_add_test(F_upsample_bilinear) pnnx_add_test(F_upsample_nearest) pnnx_add_test(F_upsample) @@ -103,7 +105,9 @@ pnnx_add_test(nn_Dropout2d) pnnx_add_test(nn_Dropout3d) pnnx_add_test(nn_ELU) pnnx_add_test(nn_Embedding) +pnnx_add_test(nn_Fold) pnnx_add_test(nn_GELU) +pnnx_add_test(nn_GLU) pnnx_add_test(nn_GroupNorm) pnnx_add_test(nn_GRU) pnnx_add_test(nn_Hardshrink) @@ -142,6 +146,7 @@ pnnx_add_test(nn_SELU) pnnx_add_test(nn_Sigmoid) pnnx_add_test(nn_SiLU) pnnx_add_test(nn_Softmax) +pnnx_add_test(nn_Softmax2d) pnnx_add_test(nn_Softmin) pnnx_add_test(nn_Softplus) pnnx_add_test(nn_Softshrink) @@ -149,6 +154,7 @@ pnnx_add_test(nn_Softsign) pnnx_add_test(nn_Tanh) pnnx_add_test(nn_Tanhshrink) pnnx_add_test(nn_Threshold) +pnnx_add_test(nn_Unfold) pnnx_add_test(nn_Upsample) pnnx_add_test(nn_UpsamplingBilinear2d) pnnx_add_test(nn_UpsamplingNearest2d) @@ -158,10 +164,14 @@ pnnx_add_test(Tensor_contiguous) pnnx_add_test(Tensor_index) pnnx_add_test(Tensor_masked_fill) pnnx_add_test(Tensor_new_empty) +pnnx_add_test(Tensor_new_full) +pnnx_add_test(Tensor_new_ones) +pnnx_add_test(Tensor_new_zeros) pnnx_add_test(Tensor_repeat) pnnx_add_test(Tensor_reshape) pnnx_add_test(Tensor_select) pnnx_add_test(Tensor_slice) +pnnx_add_test(Tensor_slice_copy) pnnx_add_test(Tensor_view) pnnx_add_test(torch_addmm) @@ -176,7 +186,6 @@ pnnx_add_test(torch_bitwise_xor) pnnx_add_test(torch_bmm) 
pnnx_add_test(torch_cat) pnnx_add_test(torch_chunk) -pnnx_add_test(torch_clamp) pnnx_add_test(torch_clone) pnnx_add_test(torch_complex) pnnx_add_test(torch_einsum) @@ -187,7 +196,6 @@ pnnx_add_test(torch_full_like) pnnx_add_test(torch_gather) pnnx_add_test(torch_ge) pnnx_add_test(torch_gt) -pnnx_add_test(torch_imag) pnnx_add_test(torch_index_select) pnnx_add_test(torch_le) pnnx_add_test(torch_logsumexp) @@ -202,7 +210,6 @@ pnnx_add_test(torch_ones) pnnx_add_test(torch_ones_like) pnnx_add_test(torch_permute) pnnx_add_test(torch_prod) -pnnx_add_test(torch_real) pnnx_add_test(torch_scatter_add) pnnx_add_test(torch_sum) pnnx_add_test(torch_split) @@ -232,6 +239,36 @@ pnnx_add_test(torch_fft_fft) pnnx_add_test(torch_fft_fft2) pnnx_add_test(torch_fft_fftn) +pnnx_add_test(torch_abs) +pnnx_add_test(torch_acos) +pnnx_add_test(torch_acosh) +pnnx_add_test(torch_asin) +pnnx_add_test(torch_asinh) +pnnx_add_test(torch_atan) +pnnx_add_test(torch_atanh) +pnnx_add_test(torch_atan2) +pnnx_add_test(torch_ceil) +pnnx_add_test(torch_clamp) +pnnx_add_test(torch_cos) +pnnx_add_test(torch_cosh) +pnnx_add_test(torch_exp) +pnnx_add_test(torch_floor) +pnnx_add_test(torch_imag) +pnnx_add_test(torch_log) +pnnx_add_test(torch_neg) +pnnx_add_test(torch_pow) +pnnx_add_test(torch_real) +pnnx_add_test(torch_reciprocal) +pnnx_add_test(torch_rsqrt) +pnnx_add_test(torch_sign) +pnnx_add_test(torch_sin) +pnnx_add_test(torch_sinh) +pnnx_add_test(torch_sqrt) +pnnx_add_test(torch_square) +pnnx_add_test(torch_tan) +pnnx_add_test(torch_tanh) +pnnx_add_test(torch_trunc) + pnnx_add_test(convnext_tiny) pnnx_add_test(mobilenet_v2) pnnx_add_test(mobilenet_v3_small) @@ -255,12 +292,20 @@ pnnx_add_test(pnnx_fuse_convtranspose2d_batchnorm2d) pnnx_add_test(pnnx_fuse_linear_batchnorm1d) pnnx_add_test(pnnx_fuse_select_to_unbind) pnnx_add_test(pnnx_fuse_slice_to_tensor_split) +pnnx_add_test(pnnx_fuse_adjacent_reshape) +pnnx_add_test(pnnx_fuse_pad_conv1d) +pnnx_add_test(pnnx_fuse_pad_conv2d) if(Torch_VERSION VERSION_GREATER_EQUAL "1.9") pnnx_add_test(F_mish) pnnx_add_test(nn_Mish) endif() +if(Torch_VERSION VERSION_GREATER_EQUAL "1.10") + pnnx_add_test(torch_bitwise_left_shift) + pnnx_add_test(torch_bitwise_right_shift) +endif() + if(Torch_VERSION VERSION_GREATER_EQUAL "1.11") pnnx_add_test(torch_fft_ihfft2) pnnx_add_test(torch_fft_ihfftn) diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index 95e1320d8e29..272551a8a5a7 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -28,8 +28,10 @@ pnnx_ncnn_add_test(F_dropout3d) pnnx_ncnn_add_test(F_elu) pnnx_ncnn_add_test(F_embedding) pnnx_ncnn_add_test(F_feature_alpha_dropout) +pnnx_ncnn_add_test(F_fold) pnnx_ncnn_add_test(F_gelu) pnnx_ncnn_add_test(F_glu) +pnnx_ncnn_add_test(F_grid_sample) pnnx_ncnn_add_test(F_group_norm) pnnx_ncnn_add_test(F_hardsigmoid) pnnx_ncnn_add_test(F_hardswish) @@ -52,6 +54,7 @@ pnnx_ncnn_add_test(F_sigmoid) pnnx_ncnn_add_test(F_silu) pnnx_ncnn_add_test(F_softmax) pnnx_ncnn_add_test(F_tanh) +pnnx_ncnn_add_test(F_unfold) pnnx_ncnn_add_test(F_upsample_bilinear) pnnx_ncnn_add_test(F_upsample_nearest) pnnx_ncnn_add_test(F_upsample) @@ -84,7 +87,9 @@ pnnx_ncnn_add_test(nn_Dropout2d) pnnx_ncnn_add_test(nn_Dropout3d) pnnx_ncnn_add_test(nn_ELU) pnnx_ncnn_add_test(nn_Embedding) +pnnx_ncnn_add_test(nn_Fold) pnnx_ncnn_add_test(nn_GELU) +pnnx_ncnn_add_test(nn_GLU) pnnx_ncnn_add_test(nn_GroupNorm) pnnx_ncnn_add_test(nn_GRU) pnnx_ncnn_add_test(nn_Hardsigmoid) @@ -114,7 +119,9 @@ pnnx_ncnn_add_test(nn_SELU) 
pnnx_ncnn_add_test(nn_Sigmoid) pnnx_ncnn_add_test(nn_SiLU) pnnx_ncnn_add_test(nn_Softmax) +pnnx_ncnn_add_test(nn_Softmax2d) pnnx_ncnn_add_test(nn_Tanh) +pnnx_ncnn_add_test(nn_Unfold) pnnx_ncnn_add_test(nn_Upsample) pnnx_ncnn_add_test(nn_UpsamplingBilinear2d) pnnx_ncnn_add_test(nn_UpsamplingNearest2d) @@ -132,7 +139,6 @@ pnnx_ncnn_add_test(torch_amin) pnnx_ncnn_add_test(torch_bmm) pnnx_ncnn_add_test(torch_cat) pnnx_ncnn_add_test(torch_chunk) -pnnx_ncnn_add_test(torch_clamp) pnnx_ncnn_add_test(torch_clone) pnnx_ncnn_add_test(torch_einsum) pnnx_ncnn_add_test(torch_logsumexp) @@ -150,6 +156,26 @@ pnnx_ncnn_add_test(torch_transpose) pnnx_ncnn_add_test(torch_unbind) pnnx_ncnn_add_test(torch_unsqueeze) +pnnx_ncnn_add_test(torch_abs) +pnnx_ncnn_add_test(torch_acos) +pnnx_ncnn_add_test(torch_asin) +pnnx_ncnn_add_test(torch_atan) +pnnx_ncnn_add_test(torch_ceil) +pnnx_ncnn_add_test(torch_clamp) +pnnx_ncnn_add_test(torch_cos) +pnnx_ncnn_add_test(torch_exp) +pnnx_ncnn_add_test(torch_floor) +pnnx_ncnn_add_test(torch_log) +pnnx_ncnn_add_test(torch_neg) +pnnx_ncnn_add_test(torch_pow) +pnnx_ncnn_add_test(torch_reciprocal) +pnnx_ncnn_add_test(torch_rsqrt) +pnnx_ncnn_add_test(torch_sin) +pnnx_ncnn_add_test(torch_sqrt) +pnnx_ncnn_add_test(torch_square) +pnnx_ncnn_add_test(torch_tan) +pnnx_ncnn_add_test(torch_tanh) + pnnx_ncnn_add_test(convnext_tiny) pnnx_ncnn_add_test(mobilenet_v2) pnnx_ncnn_add_test(mobilenet_v3_small) diff --git a/tools/pnnx/tests/ncnn/test_F_fold.py b/tools/pnnx/tests/ncnn/test_F_fold.py new file mode 100644 index 000000000000..54103fdf2611 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_fold.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = F.fold(x, output_size=22, kernel_size=3) + y = F.fold(y, output_size=(17,18), kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + z = F.fold(z, output_size=(5,11), kernel_size=(2,3), stride=1, padding=(2,4), dilation=(1,2)) + + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 108, 400) + y = torch.rand(1, 96, 190) + z = torch.rand(1, 36, 120) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_F_fold.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_F_fold.pt inputshape=[1,108,400],[1,96,190],[1,36,120]") + + # ncnn inference + import test_F_fold_ncnn + b = test_F_fold_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_F_glu.py b/tools/pnnx/tests/ncnn/test_F_glu.py index fbc939ad158b..937253c12791 100644 --- a/tools/pnnx/tests/ncnn/test_F_glu.py +++ b/tools/pnnx/tests/ncnn/test_F_glu.py @@ -27,7 +27,8 @@ def forward(self, x, y, z): z0 = F.glu(z, dim=0) z1 = F.glu(z, dim=1) z2 = F.glu(z, dim=2) - return x0, y0, y1, z0, z1, z2 + z3 = F.glu(z, dim=-1) + return x0, y0, y1, z0, z1, z2, z3 def test(): net = Model() @@ -46,7 +47,7 @@ def test(): # torchscript to pnnx import os - # os.system("../../src/pnnx test_F_glu.pt") + os.system("../../src/pnnx test_F_glu.pt inputshape=[18],[12,16],[24,28,34]") # ncnn inference import test_F_glu_ncnn diff --git a/tools/pnnx/tests/ncnn/test_F_grid_sample.py b/tools/pnnx/tests/ncnn/test_F_grid_sample.py new file mode 100644 index 000000000000..c84d38232b1e --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_grid_sample.py @@ -0,0 +1,98 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
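The dim=-1 case added to test_F_glu.py just above exercises the negative-axis handling that this patch puts into the nn_GLU (and nn.Softmax) ncnn rewriter passes: the dim is rejected if it names the batch axis, normalized against the full input rank if negative, and then shifted down by one when it lies past the batch axis, since ncnn blobs carry no batch dimension. A minimal Python sketch of that mapping — not code from the patch, the helper name is hypothetical:

def torch_dim_to_ncnn_axis(dim, input_rank, batch_index):
    # the rewriter refuses to operate along the batch axis
    if dim == batch_index:
        raise ValueError("glu/softmax along the batch axis is not supported")
    # a negative PyTorch dim is normalized against the full tensor rank
    if dim < 0:
        dim = input_rank + dim
    # ncnn blobs carry no batch dimension, so axes past it shift down by one
    if dim > batch_index:
        dim -= 1
    return dim

# example: dim=-1 on an NCHW tensor (rank 4, batch axis 0) resolves to ncnn axis 2
assert torch_dim_to_ncnn_axis(-1, 4, 0) == 2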
+ + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, xg1, xg2, y, yg1, yg2): + # norm to -1 ~ 1 + xg1 = xg1 * 2 - 1 + xg2 = xg2 * 2 - 1 + yg1 = yg1 * 2 - 1 + yg2 = yg2 * 2 - 1 + + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) + x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='reflection', align_corners=False) + x = F.grid_sample(x, xg2, mode='nearest', padding_mode='zeros', align_corners=False) + x = F.grid_sample(x, xg1, mode='nearest', padding_mode='border', align_corners=False) + x = F.grid_sample(x, xg2, mode='nearest', padding_mode='reflection', align_corners=False) + x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='zeros', align_corners=False) + x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='border', align_corners=False) + x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='reflection', align_corners=False) + x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='zeros', align_corners=True) + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='border', align_corners=True) + x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='reflection', align_corners=True) + x = F.grid_sample(x, xg1, mode='nearest', padding_mode='zeros', align_corners=True) + x = F.grid_sample(x, xg2, mode='nearest', padding_mode='border', align_corners=True) + x = F.grid_sample(x, xg1, mode='nearest', padding_mode='reflection', align_corners=True) + x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='zeros', align_corners=True) + x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='border', align_corners=True) + x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='reflection', align_corners=True) + + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) + y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='reflection', align_corners=False) + y = F.grid_sample(y, yg2, mode='nearest', padding_mode='zeros', align_corners=False) + y = F.grid_sample(y, yg1, mode='nearest', padding_mode='border', align_corners=False) + y = F.grid_sample(y, yg2, mode='nearest', padding_mode='reflection', align_corners=False) + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=True) + y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=True) + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='reflection', align_corners=True) + y = F.grid_sample(y, yg2, mode='nearest', padding_mode='zeros', align_corners=True) + y = F.grid_sample(y, yg1, mode='nearest', padding_mode='border', align_corners=True) + y = F.grid_sample(y, yg2, mode='nearest', padding_mode='reflection', align_corners=True) + + return x, y + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 12, 16) + xg1 = torch.rand(1, 21, 27, 2) + xg2 = torch.rand(1, 12, 16, 2) + y = torch.rand(1, 5, 10, 12, 16) + yg1 = torch.rand(1, 10, 21, 27, 3) + yg2 = torch.rand(1, 10, 12, 16, 3) + + a0, a1 = net(x, xg1, xg2, y, yg1, yg2) + + # export torchscript + mod = torch.jit.trace(net, (x, xg1, xg2, y, yg1, yg2)) + mod.save("test_F_grid_sample.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_F_grid_sample.pt 
inputshape=[1,3,12,16],[1,21,27,2],[1,12,16,2],[1,5,10,12,16],[1,10,21,27,3],[1,10,12,16,3]") + + # ncnn inference + import test_F_grid_sample_ncnn + b0, b1 = test_F_grid_sample_ncnn.test_inference() + + return torch.allclose(a0, b0, 1e-4, 1e-4) and torch.allclose(a1, b1, 1e-4, 1e-4) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_F_group_norm.py b/tools/pnnx/tests/ncnn/test_F_group_norm.py index 0e4710fbbd6d..6b0347950450 100644 --- a/tools/pnnx/tests/ncnn/test_F_group_norm.py +++ b/tools/pnnx/tests/ncnn/test_F_group_norm.py @@ -20,29 +20,37 @@ class Model(nn.Module): def __init__(self): super(Model, self).__init__() + self.w3 = nn.Parameter(torch.rand(16)) + self.b3 = nn.Parameter(torch.rand(16)) + self.w4 = nn.Parameter(torch.rand(12)) + self.b4 = nn.Parameter(torch.rand(12)) self.w5 = nn.Parameter(torch.rand(32)) self.b5 = nn.Parameter(torch.rand(32)) - def forward(self, z): + def forward(self, x, y, z): + x = F.group_norm(x, 4, self.w3, self.b3) + y = F.group_norm(y, 6, self.w4, self.b4) z = F.group_norm(z, 8, self.w5, self.b5, eps=1e-2) - return z + return x, y, z def test(): net = Model() net.eval() torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(1, 12, 16) z = torch.rand(1, 32, 12, 16) - a = net(z) + a = net(x, y, z) # export torchscript - mod = torch.jit.trace(net, z) + mod = torch.jit.trace(net, (x, y, z)) mod.save("test_F_group_norm.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_F_group_norm.pt inputshape=[1,32,12,16]") + os.system("../../src/pnnx test_F_group_norm.pt inputshape=[1,16],[1,12,16],[1,32,12,16]") # ncnn inference import test_F_group_norm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_F_softmax.py b/tools/pnnx/tests/ncnn/test_F_softmax.py index f7d612eda643..83a5324f49da 100644 --- a/tools/pnnx/tests/ncnn/test_F_softmax.py +++ b/tools/pnnx/tests/ncnn/test_F_softmax.py @@ -24,7 +24,8 @@ def forward(self, x, y, z): x = F.softmax(x, 0) y = F.softmax(y, 1) z = F.softmax(z, 2) - return x, y, z + z2 = F.softmax(z, -1) + return x, y, z, z2 def test(): net = Model() diff --git a/tools/pnnx/tests/ncnn/test_F_unfold.py b/tools/pnnx/tests/ncnn/test_F_unfold.py new file mode 100644 index 000000000000..e8e1a603cc3a --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_unfold.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + x0 = F.unfold(x, kernel_size=3) + x1 = F.unfold(x, kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + x2 = F.unfold(x, kernel_size=(1,3), stride=1, padding=(2,4), dilation=(1,2)) + + return x0, x1, x2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 64, 64) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_F_unfold.pt") + + # torchscript to ncnn + import os + os.system("../../src/pnnx test_F_unfold.pt inputshape=[1,12,64,64]") + + # ncnn inference + import test_F_unfold_ncnn + b = test_F_unfold_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_Fold.py b/tools/pnnx/tests/ncnn/test_nn_Fold.py new file mode 100644 index 000000000000..8b07b2b5d385 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_Fold.py @@ -0,0 +1,67 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.fold_0 = nn.Fold(output_size=22, kernel_size=3) + self.fold_1 = nn.Fold(output_size=(17,18), kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + self.fold_2 = nn.Fold(output_size=(5,11), kernel_size=(2,3), stride=1, padding=(2,4), dilation=(1,2)) + + def forward(self, x, y, z): + x = self.fold_0(x) + y = self.fold_1(y) + z = self.fold_2(z) + + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 108, 400) + y = torch.rand(1, 96, 190) + z = torch.rand(1, 36, 120) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_Fold.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_nn_Fold.pt inputshape=[1,108,400],[1,96,190],[1,36,120]") + + # ncnn inference + import test_nn_Fold_ncnn + b = test_nn_Fold_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_GLU.py b/tools/pnnx/tests/ncnn/test_nn_GLU.py new file mode 100644 index 000000000000..49a018ee2c4a --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_GLU.py @@ -0,0 +1,67 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.GLU(dim=0) + self.act_1 = nn.GLU(dim=1) + self.act_2 = nn.GLU(dim=2) + self.act_3 = nn.GLU(dim=-1) + + def forward(self, x, y, z): + x = self.act_0(x) + y = self.act_1(y) + z = self.act_2(z) + z2 = self.act_3(z) + return x, y, z, z2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(12) + y = torch.rand(12, 64) + z = torch.rand(12, 24, 64) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_GLU.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_nn_GLU.pt inputshape=[12],[12,64],[12,24,64]") + + # ncnn inference + import test_nn_GLU_ncnn + b = test_nn_GLU_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_GroupNorm.py b/tools/pnnx/tests/ncnn/test_nn_GroupNorm.py index 71f7d684fc5d..c016e7ae250e 100644 --- a/tools/pnnx/tests/ncnn/test_nn_GroupNorm.py +++ b/tools/pnnx/tests/ncnn/test_nn_GroupNorm.py @@ -24,34 +24,47 @@ def __init__(self): self.gn_1 = nn.GroupNorm(num_groups=12, num_channels=12, eps=1e-2, affine=True) self.gn_2 = nn.GroupNorm(num_groups=1, num_channels=12, eps=1e-4, affine=True) - def forward(self, x): + def forward(self, x, y, z): x = self.gn_0(x) x = self.gn_1(x) x = self.gn_2(x) - return x + + y = self.gn_0(y) + y = self.gn_1(y) + y = self.gn_2(y) + + z = self.gn_0(z) + z = self.gn_1(z) + z = self.gn_2(z) + return x, y, z def test(): net = Model() net.eval() torch.manual_seed(0) - x = torch.rand(1, 12, 24, 64) + x = torch.rand(1, 12, 64) + y = torch.rand(1, 12, 24, 64) + z = torch.rand(1, 12, 24, 32, 64) - a0 = net(x) + a = net(x, y, z) # export torchscript - mod = torch.jit.trace(net, x) + mod = torch.jit.trace(net, (x, y, z)) mod.save("test_nn_GroupNorm.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_nn_GroupNorm.pt inputshape=[1,12,24,64]") + os.system("../../src/pnnx test_nn_GroupNorm.pt inputshape=[1,12,64],[1,12,24,64],[1,12,24,32,64]") # ncnn inference import test_nn_GroupNorm_ncnn - b0 = test_nn_GroupNorm_ncnn.test_inference() + b = test_nn_GroupNorm_ncnn.test_inference() - return torch.allclose(a0, b0, 1e-4, 1e-4) + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True if __name__ == "__main__": if test(): diff --git a/tools/pnnx/tests/ncnn/test_nn_LSTM.py b/tools/pnnx/tests/ncnn/test_nn_LSTM.py index 575d44aacd14..a51f5e940547 100644 --- a/tools/pnnx/tests/ncnn/test_nn_LSTM.py +++ b/tools/pnnx/tests/ncnn/test_nn_LSTM.py @@ -22,15 +22,15 @@ def __init__(self): self.lstm_0_0 = nn.LSTM(input_size=32, hidden_size=16) self.lstm_0_1 = nn.LSTM(input_size=16, hidden_size=16, num_layers=3, bias=False) - 
self.lstm_0_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) - self.lstm_0_3 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) - self.lstm_0_4 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) + self.lstm_0_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) + self.lstm_0_3 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) + self.lstm_0_4 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) self.lstm_1_0 = nn.LSTM(input_size=25, hidden_size=16, batch_first=True) self.lstm_1_1 = nn.LSTM(input_size=16, hidden_size=16, num_layers=3, bias=False, batch_first=True) - self.lstm_1_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) - self.lstm_1_3 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) - self.lstm_1_4 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) + self.lstm_1_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) + self.lstm_1_3 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) + self.lstm_1_4 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) def forward(self, x, y): x = x.permute(1, 0, 2) @@ -38,14 +38,14 @@ def forward(self, x, y): x0, _ = self.lstm_0_0(x) x1, _ = self.lstm_0_1(x0) x2, (h0, c0) = self.lstm_0_2(x1) - x3, (h1, c1) = self.lstm_0_3(x1, (h0, c0)) - x4, _ = self.lstm_0_4(x1, (h1, c1)) + x3, (h1, c1) = self.lstm_0_3(x2, (h0, c0)) + x4, _ = self.lstm_0_4(x3, (h1, c1)) y0, _ = self.lstm_1_0(y) y1, _ = self.lstm_1_1(y0) y2, (h2, c2) = self.lstm_1_2(y1) - y3, (h3, c3) = self.lstm_1_3(y1, (h2, c2)) - y4, _ = self.lstm_1_4(y1, (h3, c3)) + y3, (h3, c3) = self.lstm_1_3(y2, (h2, c2)) + y4, _ = self.lstm_1_4(y3, (h3, c3)) x2 = x2.permute(1, 0, 2) x3 = x3.permute(1, 0, 2) diff --git a/tools/pnnx/tests/ncnn/test_nn_MultiheadAttention.py b/tools/pnnx/tests/ncnn/test_nn_MultiheadAttention.py index 05a4f2c0b048..a35d05b0f282 100644 --- a/tools/pnnx/tests/ncnn/test_nn_MultiheadAttention.py +++ b/tools/pnnx/tests/ncnn/test_nn_MultiheadAttention.py @@ -22,40 +22,61 @@ def __init__(self): super(Model, self).__init__() self.attention_0_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4) + self.attention_0_1 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20) if version.parse(torch.__version__) >= version.parse('1.9'): - self.attention_1_0 = nn.MultiheadAttention(embed_dim=40, num_heads=4, batch_first=True) + self.attention_1_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4, batch_first=True) + self.attention_1_1 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20, batch_first=True) - def forward(self, x, y): - x0, _ = self.attention_0_0(x, x, x) + def forward(self, xq, xk, xv, yq, yk, yv): + x0, _ = self.attention_0_0(xq, xq, xq) + x1, _ = self.attention_0_0(xq, xk, xv) + x2, _ = self.attention_0_0(xq, xk, xk) + x3, _ = self.attention_0_1(yq, yk, yv) if version.parse(torch.__version__) < version.parse('1.9'): - return x0 + return x0, x1, x2, x3 - y0, _ = self.attention_1_0(y, y, y) + xq = xq.transpose(0, 1) + xk = xk.transpose(0, 1) + xv = 
xv.transpose(0, 1) + yq = yq.transpose(0, 1) + yk = yk.transpose(0, 1) + yv = yv.transpose(0, 1) - return x0, y0 + y0, _ = self.attention_1_0(xq, xq, xq) + y1, _ = self.attention_1_0(xq, xk, xv) + y2, _ = self.attention_1_0(xq, xk, xk) + y3, _ = self.attention_1_1(yq, yk, yv) + + return x0, x1, x2, x3, y0, y1, y2, y3 def test(): + torch.set_grad_enabled(False) + net = Model().half().float() net.eval() torch.manual_seed(0) - x = torch.rand(1, 1, 64) - y = torch.rand(1, 15, 40) + xq = torch.rand(20, 1, 64) + xk = torch.rand(20, 1, 64) + xv = torch.rand(20, 1, 64) + yq = torch.rand(15, 1, 40) + yk = torch.rand(24, 1, 30) + yv = torch.rand(24, 1, 20) - a = net(x, y) + a = net(xq, xk, xv, yq, yk, yv) # export torchscript if version.parse(torch.__version__) >= version.parse('1.12.0'): - mod = torch.jit.trace(net, (x, y), check_trace=False) + mod = torch.jit.trace(net, (xq, xk, xv, yq, yk, yv), check_trace=False) else: - mod = torch.jit.trace(net, (x, y)) + mod = torch.jit.trace(net, (xq, xk, xv, yq, yk, yv)) mod.save("test_nn_MultiheadAttention.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_nn_MultiheadAttention.pt inputshape=[1,1,64],[1,15,40]") + os.system("../../src/pnnx test_nn_MultiheadAttention.pt inputshape=[20,1,64],[20,1,64],[20,1,64],[15,1,40],[24,1,30],[24,1,20]") # ncnn inference import test_nn_MultiheadAttention_ncnn diff --git a/tools/pnnx/tests/ncnn/test_nn_Softmax.py b/tools/pnnx/tests/ncnn/test_nn_Softmax.py index aa9e3b737a8a..d4ca3df0ff26 100644 --- a/tools/pnnx/tests/ncnn/test_nn_Softmax.py +++ b/tools/pnnx/tests/ncnn/test_nn_Softmax.py @@ -23,12 +23,14 @@ def __init__(self): self.act_0 = nn.Softmax(dim=0) self.act_1 = nn.Softmax(dim=1) self.act_2 = nn.Softmax(dim=2) + self.act_3 = nn.Softmax(dim=-1) def forward(self, x, y, z): x = self.act_0(x) y = self.act_1(y) z = self.act_2(z) - return x, y, z + z2 = self.act_3(z) + return x, y, z, z2 def test(): net = Model() diff --git a/tools/pnnx/tests/ncnn/test_nn_Softmax2d.py b/tools/pnnx/tests/ncnn/test_nn_Softmax2d.py new file mode 100644 index 000000000000..c92537e90348 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_Softmax2d.py @@ -0,0 +1,56 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softmax2d() + + def forward(self, x): + x = self.act_0(x) + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24, 64) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_nn_Softmax2d.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_nn_Softmax2d.pt inputshape=[1,12,24,64]") + + # ncnn inference + import test_nn_Softmax2d_ncnn + b = test_nn_Softmax2d_ncnn.test_inference() + + return torch.allclose(a, b, 1e-4, 1e-4) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_Unfold.py b/tools/pnnx/tests/ncnn/test_nn_Unfold.py new file mode 100644 index 000000000000..8d618f761507 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_Unfold.py @@ -0,0 +1,65 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.unfold_0 = nn.Unfold(kernel_size=3) + self.unfold_1 = nn.Unfold(kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + self.unfold_2 = nn.Unfold(kernel_size=(1,3), stride=1, padding=(2,4), dilation=(1,2)) + + def forward(self, x): + x0 = self.unfold_0(x) + x1 = self.unfold_1(x) + x2 = self.unfold_2(x) + + return x0, x1, x2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 64, 64) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_nn_Unfold.pt") + + # torchscript to ncnn + import os + os.system("../../src/pnnx test_nn_Unfold.pt inputshape=[1,12,64,64]") + + # ncnn inference + import test_nn_Unfold_ncnn + b = test_nn_Unfold_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_abs.py b/tools/pnnx/tests/ncnn/test_torch_abs.py new file mode 100644 index 000000000000..8d824a11b1ea --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_abs.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.abs(x - 0.5) + y = torch.abs(y - 0.5) + z = torch.abs(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_abs.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_abs.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_abs_ncnn + b = test_torch_abs_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_acos.py b/tools/pnnx/tests/ncnn/test_torch_acos.py new file mode 100644 index 000000000000..4fc0e43a37a8 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_acos.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.acos(x) + y = torch.acos(y) + z = torch.acos(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_acos.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_acos.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_acos_ncnn + b = test_torch_acos_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_asin.py b/tools/pnnx/tests/ncnn/test_torch_asin.py new file mode 100644 index 000000000000..24099e592145 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_asin.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.asin(x) + y = torch.asin(y) + z = torch.asin(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_asin.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_asin.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_asin_ncnn + b = test_torch_asin_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_atan.py b/tools/pnnx/tests/ncnn/test_torch_atan.py new file mode 100644 index 000000000000..bef6aae58913 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_atan.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.atan(x) + y = torch.atan(y) + z = torch.atan(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_atan.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_atan.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_atan_ncnn + b = test_torch_atan_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_ceil.py b/tools/pnnx/tests/ncnn/test_torch_ceil.py new file mode 100644 index 000000000000..4ee628adb964 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_ceil.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.ceil(x * 10) + y = torch.ceil(y * 10) + z = torch.ceil(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_ceil.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_ceil.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_ceil_ncnn + b = test_torch_ceil_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_cos.py b/tools/pnnx/tests/ncnn/test_torch_cos.py new file mode 100644 index 000000000000..f32b1ff4e63a --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_cos.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.cos(x) + y = torch.cos(y) + z = torch.cos(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_cos.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_cos.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_cos_ncnn + b = test_torch_cos_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_exp.py b/tools/pnnx/tests/ncnn/test_torch_exp.py new file mode 100644 index 000000000000..5e608687c40c --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_exp.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.exp(x) + y = torch.exp(y) + z = torch.exp(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_exp.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_exp.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_exp_ncnn + b = test_torch_exp_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_floor.py b/tools/pnnx/tests/ncnn/test_torch_floor.py new file mode 100644 index 000000000000..e100d96de192 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_floor.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.floor(x * 10) + y = torch.floor(y * 10) + z = torch.floor(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_floor.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_floor.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_floor_ncnn + b = test_torch_floor_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_log.py b/tools/pnnx/tests/ncnn/test_torch_log.py new file mode 100644 index 000000000000..a3583f472737 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_log.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.log(x) + y = torch.log(y) + z = torch.log(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_log.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_log.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_log_ncnn + b = test_torch_log_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_neg.py b/tools/pnnx/tests/ncnn/test_torch_neg.py new file mode 100644 index 000000000000..75197fa16fd6 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_neg.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.neg(x - 0.5) + y = torch.neg(y - 0.5) + z = torch.neg(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_neg.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_neg.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_neg_ncnn + b = test_torch_neg_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_pow.py b/tools/pnnx/tests/ncnn/test_torch_pow.py new file mode 100644 index 000000000000..26c850cbaad8 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_pow.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.pow(x, y) + out1 = torch.pow(y, y) + out2 = torch.pow(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_pow.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_pow.pt inputshape=[3,16],[3,16],[5,9,3]") + + # ncnn inference + import test_torch_pow_ncnn + b = test_torch_pow_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_reciprocal.py b/tools/pnnx/tests/ncnn/test_torch_reciprocal.py new file mode 100644 index 000000000000..83f2c89c96bb --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_reciprocal.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.reciprocal(x) + y = torch.reciprocal(y) + z = torch.reciprocal(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_reciprocal.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_reciprocal.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_reciprocal_ncnn + b = test_torch_reciprocal_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_rsqrt.py b/tools/pnnx/tests/ncnn/test_torch_rsqrt.py new file mode 100644 index 000000000000..b3a34c70d519 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_rsqrt.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.rsqrt(x) + y = torch.rsqrt(y) + z = torch.rsqrt(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_rsqrt.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_rsqrt.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_rsqrt_ncnn + b = test_torch_rsqrt_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-3, 1e-3): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_sin.py b/tools/pnnx/tests/ncnn/test_torch_sin.py new file mode 100644 index 000000000000..d984a269e1c5 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_sin.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sin(x) + y = torch.sin(y) + z = torch.sin(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sin.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_sin.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_sin_ncnn + b = test_torch_sin_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_sqrt.py b/tools/pnnx/tests/ncnn/test_torch_sqrt.py new file mode 100644 index 000000000000..bfd5ddb3d2a7 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_sqrt.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sqrt(x) + y = torch.sqrt(y) + z = torch.sqrt(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sqrt.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_sqrt.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_sqrt_ncnn + b = test_torch_sqrt_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_square.py b/tools/pnnx/tests/ncnn/test_torch_square.py new file mode 100644 index 000000000000..d5b5c8af9108 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_square.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.square(x) + y = torch.square(y) + z = torch.square(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_square.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_square.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_square_ncnn + b = test_torch_square_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_squeeze.py b/tools/pnnx/tests/ncnn/test_torch_squeeze.py index 339cebe8f88e..807997677e43 100644 --- a/tools/pnnx/tests/ncnn/test_torch_squeeze.py +++ b/tools/pnnx/tests/ncnn/test_torch_squeeze.py @@ -35,7 +35,7 @@ def test(): x = torch.rand(1, 16) y = torch.rand(3, 1) z = torch.rand(5, 1, 11) - w = torch.rand(5, 9, 1) + w = torch.rand(5, 9, 1, 33) a = net(x, y, z, w) @@ -45,7 +45,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_torch_squeeze.pt inputshape=[1,16],[3,1],[5,1,11],[5,9,1]") + os.system("../../src/pnnx test_torch_squeeze.pt inputshape=[1,16],[3,1],[5,1,11],[5,9,1,33]") # ncnn inference import test_torch_squeeze_ncnn diff --git a/tools/pnnx/tests/ncnn/test_torch_tan.py b/tools/pnnx/tests/ncnn/test_torch_tan.py new file mode 100644 index 000000000000..b97bad2fd9ad --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_tan.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.tan(x) + y = torch.tan(y) + z = torch.tan(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_tan.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_tan.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_tan_ncnn + b = test_torch_tan_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_tanh.py b/tools/pnnx/tests/ncnn/test_torch_tanh.py new file mode 100644 index 000000000000..10b1b72ffcdc --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_tanh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.tanh(x) + y = torch.tanh(y) + z = torch.tanh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_tanh.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_tanh.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_tanh_ncnn + b = test_torch_tanh_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_unsqueeze.py b/tools/pnnx/tests/ncnn/test_torch_unsqueeze.py index baf121710751..b15a43419e40 100644 --- a/tools/pnnx/tests/ncnn/test_torch_unsqueeze.py +++ b/tools/pnnx/tests/ncnn/test_torch_unsqueeze.py @@ -20,12 +20,14 @@ class Model(nn.Module): def __init__(self): super(Model, self).__init__() - def forward(self, x, y): + def forward(self, x, y, z): x0 = torch.unsqueeze(x, 0) x1 = torch.unsqueeze(x, 1) y0 = torch.unsqueeze(y, 1) y1 = torch.unsqueeze(y, -1) - return x0, x1, y0, y1 + z0 = torch.unsqueeze(z, 0) + z1 = torch.unsqueeze(z, -2) + return x0, x1, y0, y1, z0, z1 def test(): net = Model() @@ -34,16 +36,17 @@ def test(): torch.manual_seed(0) x = torch.rand(16) y = torch.rand(9, 11) + z = torch.rand(4, 6, 7) - a = net(x, y) + a = net(x, y, z) # export torchscript - mod = torch.jit.trace(net, (x, y)) + mod = torch.jit.trace(net, (x, y, z)) mod.save("test_torch_unsqueeze.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_torch_unsqueeze.pt inputshape=[16],[9,11]") + os.system("../../src/pnnx test_torch_unsqueeze.pt inputshape=[16],[9,11],[4,6,7]") # ncnn inference import test_torch_unsqueeze_ncnn diff --git a/tools/pnnx/tests/test_F_fold.py b/tools/pnnx/tests/test_F_fold.py new file mode 100644 index 000000000000..68c5b566d567 --- /dev/null +++ b/tools/pnnx/tests/test_F_fold.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = F.fold(x, output_size=22, kernel_size=3) + y = F.fold(y, output_size=(17,18), kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + z = F.fold(z, output_size=(5,11), kernel_size=(2,3), stride=1, padding=(2,4), dilation=(1,2)) + + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 108, 400) + y = torch.rand(1, 96, 190) + z = torch.rand(1, 36, 120) + + a0, a1, a2 = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_F_fold.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_F_fold.pt inputshape=[1,108,400],[1,96,190],[1,36,120]") + + # pnnx inference + import test_F_fold_pnnx + b0, b1, b2 = test_F_fold_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_F_glu.py b/tools/pnnx/tests/test_F_glu.py index ba8432875660..0c1868683447 100644 --- a/tools/pnnx/tests/test_F_glu.py +++ b/tools/pnnx/tests/test_F_glu.py @@ -38,7 +38,7 @@ def test(): y = torch.rand(12, 16) z = torch.rand(24, 28, 34) - x0, y0, y1, z0, z1, z2= net(x, y, z) + x0, y0, y1, z0, z1, z2 = net(x, y, z) # export torchscript mod = torch.jit.trace(net, (x, y, z)) @@ -46,13 +46,11 @@ def test(): # torchscript to pnnx import os - os.system("../src/pnnx test_F_glu.pt") + os.system("../src/pnnx test_F_glu.pt inputshape=[18],[12,16],[24,28,34]") # pnnx inference import test_F_glu_pnnx - m = test_F_glu_pnnx.Model() - m.eval() - x0p, y0p, y1p, z0p, z1p, z2p = m(x, y, z) + x0p, y0p, y1p, z0p, z1p, z2p = test_F_glu_pnnx.test_inference() return torch.equal(x0, x0p) and torch.equal(y0, y0p) and torch.equal(y1, y1p) \ and torch.equal(z0, z0p) and torch.equal(z1, z1p) and torch.equal(z2, z2p) diff --git a/tools/pnnx/tests/test_F_grid_sample.py b/tools/pnnx/tests/test_F_grid_sample.py index ae4ed354cdf3..8cb6d214568c 100644 --- a/tools/pnnx/tests/test_F_grid_sample.py +++ b/tools/pnnx/tests/test_F_grid_sample.py @@ -21,6 +21,12 @@ def __init__(self): super(Model, self).__init__() def forward(self, x, xg1, xg2, y, yg1, yg2): + # norm to -1 ~ 1 + xg1 = xg1 * 2 - 1 + xg2 = xg2 * 2 - 1 + yg1 = yg1 * 2 - 1 + yg2 = yg2 * 2 - 1 + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='reflection', align_corners=False) diff --git a/tools/pnnx/tests/test_F_unfold.py b/tools/pnnx/tests/test_F_unfold.py new file mode 100644 index 000000000000..51f19a4f48a4 --- /dev/null +++ b/tools/pnnx/tests/test_F_unfold.py @@ -0,0 +1,58 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + x0 = F.unfold(x, kernel_size=3) + x1 = F.unfold(x, kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + x2 = F.unfold(x, kernel_size=(1,3), stride=1, padding=(2,4), dilation=(1,2)) + + return x0, x1, x2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 64, 64) + + a0, a1, a2 = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_F_unfold.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_F_unfold.pt inputshape=[1,12,64,64]") + + # pnnx inference + import test_F_unfold_pnnx + b0, b1, b2 = test_F_unfold_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_Tensor_new_full.py b/tools/pnnx/tests/test_Tensor_new_full.py new file mode 100644 index 000000000000..f6855f201a75 --- /dev/null +++ b/tools/pnnx/tests/test_Tensor_new_full.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
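The next three tests cover Tensor.new_full, new_ones and new_zeros. These factory methods take a target shape (and, for new_full, a fill value) and return a tensor of that shape filled accordingly, inheriting dtype and device from the tensor they are called on; the source tensor's own shape plays no role. A quick illustration, not part of the test files:

import torch

x = torch.rand(1, 16)              # only dtype/device are inherited from x
a = x.new_full((2, 2), 1.5)        # 2x2 float32 tensor filled with 1.5
b = x.new_ones(3)                  # [1., 1., 1.]
c = x.new_zeros((1, 2, 1))         # zeros with shape (1, 2, 1)
assert a.dtype == x.dtype and a.shape == (2, 2)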
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + out0 = x.new_full((2,2), 1.5) + out1 = x.new_full((3,), 3) + out2 = x.new_full((4,5,6,7,8), -0.5) + out3 = x.new_full((1,2,1), 0) + out4 = x.new_full((3,3,3,3), 1) + return out0, out1, out2, out3, out4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_Tensor_new_full.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_Tensor_new_full.pt inputshape=[1,16]") + + # pnnx inference + import test_Tensor_new_full_pnnx + b = test_Tensor_new_full_pnnx.test_inference() + + # test shape only for uninitialized data + for a0, b0 in zip(a, b): + if not a0.shape == b0.shape: + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_Tensor_new_ones.py b/tools/pnnx/tests/test_Tensor_new_ones.py new file mode 100644 index 000000000000..b1ee76b13c55 --- /dev/null +++ b/tools/pnnx/tests/test_Tensor_new_ones.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + out0 = x.new_ones((2,2)) + out1 = x.new_ones(3) + out2 = x.new_ones((4,5,6,7,8)) + out3 = x.new_ones((1,2,1)) + out4 = x.new_ones((3,3,3,3)) + return out0, out1, out2, out3, out4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_Tensor_new_ones.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_Tensor_new_ones.pt inputshape=[1,16]") + + # pnnx inference + import test_Tensor_new_ones_pnnx + b = test_Tensor_new_ones_pnnx.test_inference() + + # test shape only for uninitialized data + for a0, b0 in zip(a, b): + if not a0.shape == b0.shape: + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_Tensor_new_zeros.py b/tools/pnnx/tests/test_Tensor_new_zeros.py new file mode 100644 index 000000000000..abe87abbc4a2 --- /dev/null +++ b/tools/pnnx/tests/test_Tensor_new_zeros.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + out0 = x.new_zeros((2,2)) + out1 = x.new_zeros(3) + out2 = x.new_zeros((4,5,6,7,8)) + out3 = x.new_zeros((1,2,1)) + out4 = x.new_zeros((3,3,3,3)) + return out0, out1, out2, out3, out4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_Tensor_new_zeros.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_Tensor_new_zeros.pt inputshape=[1,16]") + + # pnnx inference + import test_Tensor_new_zeros_pnnx + b = test_Tensor_new_zeros_pnnx.test_inference() + + # test shape only for uninitialized data + for a0, b0 in zip(a, b): + if not a0.shape == b0.shape: + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_Tensor_slice_copy.py b/tools/pnnx/tests/test_Tensor_slice_copy.py new file mode 100644 index 000000000000..e3c76a2b867f --- /dev/null +++ b/tools/pnnx/tests/test_Tensor_slice_copy.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + x = x.clone() + x[2:10,...] 
+= 1 + x[...,1] = x[...,-1] * 3 + y = x.clone() + x[:,:,3,::2].clamp_(0, 0.5) + x[:,:,3,::2] = x[:,:,3,::2].exp_() + x[:,:,::2,:] = y[:,:,::2,:].pow(2) + x[:,:,:,:] = x[:,:,:,:] / 2 + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(18, 15, 19, 20) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_Tensor_slice_copy.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_Tensor_slice_copy.pt inputshape=[18,15,19,20]") + + # pnnx inference + import test_Tensor_slice_copy_pnnx + b = test_Tensor_slice_copy_pnnx.test_inference() + + return torch.equal(a, b) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_Fold.py b/tools/pnnx/tests/test_nn_Fold.py new file mode 100644 index 000000000000..8f53639db2ab --- /dev/null +++ b/tools/pnnx/tests/test_nn_Fold.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.fold_0 = nn.Fold(output_size=22, kernel_size=3) + self.fold_1 = nn.Fold(output_size=(17,18), kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + self.fold_2 = nn.Fold(output_size=(5,11), kernel_size=(2,3), stride=1, padding=(2,4), dilation=(1,2)) + + def forward(self, x, y, z): + x = self.fold_0(x) + y = self.fold_1(y) + z = self.fold_2(z) + + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 108, 400) + y = torch.rand(1, 96, 190) + z = torch.rand(1, 36, 120) + + a0, a1, a2 = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_Fold.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_nn_Fold.pt inputshape=[1,108,400],[1,96,190],[1,36,120]") + + # pnnx inference + import test_nn_Fold_pnnx + b0, b1, b2 = test_nn_Fold_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_GLU.py b/tools/pnnx/tests/test_nn_GLU.py new file mode 100644 index 000000000000..0643f3643555 --- /dev/null +++ b/tools/pnnx/tests/test_nn_GLU.py @@ -0,0 +1,79 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.glu0 = nn.GLU(dim=0) + self.glu1 = nn.GLU(dim=1) + self.glu2 = nn.GLU(dim=2) + + def forward(self, x, y, z): + x0 = self.glu0(x) + + y0 = self.glu0(y) + y1 = self.glu1(y) + + z0 = self.glu0(z) + z1 = self.glu1(z) + z2 = self.glu2(z) + return x0, y0, y1, z0, z1, z2 + + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(18) + y = torch.rand(12, 16) + z = torch.rand(24, 28, 34) + + x0, y0, y1, z0, z1, z2 = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_GLU.pt") + + # torchscript to pnnx + import os + + os.system("../src/pnnx test_nn_GLU.pt inputshape=[18],[12,16],[24,28,34]") + + # pnnx inference + import test_nn_GLU_pnnx + + x0p, y0p, y1p, z0p, z1p, z2p = test_nn_GLU_pnnx.test_inference() + + return ( + torch.equal(x0, x0p) + and torch.equal(y0, y0p) + and torch.equal(y1, y1p) + and torch.equal(z0, z0p) + and torch.equal(z1, z1p) + and torch.equal(z2, z2p) + ) + + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_LSTM.py b/tools/pnnx/tests/test_nn_LSTM.py index 36274c8fc387..33c54219050c 100644 --- a/tools/pnnx/tests/test_nn_LSTM.py +++ b/tools/pnnx/tests/test_nn_LSTM.py @@ -22,24 +22,24 @@ def __init__(self): self.lstm_0_0 = nn.LSTM(input_size=32, hidden_size=16) self.lstm_0_1 = nn.LSTM(input_size=16, hidden_size=16, num_layers=3, bias=False) - self.lstm_0_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) - self.lstm_0_3 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) + self.lstm_0_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) + self.lstm_0_3 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) self.lstm_1_0 = nn.LSTM(input_size=25, hidden_size=16, batch_first=True) self.lstm_1_1 = nn.LSTM(input_size=16, hidden_size=16, num_layers=3, bias=False, batch_first=True) - self.lstm_1_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) - self.lstm_1_3 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) + self.lstm_1_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) + self.lstm_1_3 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) def forward(self, x, y): x0, (h0, c0) = self.lstm_0_0(x) x1, (h1, c1) = self.lstm_0_1(x0) x2, (h2, c2) = self.lstm_0_2(x1) - x3, (h3, c3) = self.lstm_0_3(x1, (h2, c2)) + x3, (h3, c3) = self.lstm_0_3(x2, (h2, c2)) y0, (h4, c4) = self.lstm_1_0(y) y1, (h5, c5) = self.lstm_1_1(y0) y2, (h6, c6) = self.lstm_1_2(y1) - y3, (h7, c7) = self.lstm_1_3(y1, (h6, c6)) + y3, (h7, c7) = self.lstm_1_3(y2, (h6, c6)) return x2, x3, h0, h1, h2, h3, c0, c1, c2, c3, y2, y3, h4, 
h5, h6, h7, c4, c5, c6, c7 def test(): diff --git a/tools/pnnx/tests/test_nn_MultiheadAttention.py b/tools/pnnx/tests/test_nn_MultiheadAttention.py index 67dabab9532f..cc222621c20f 100644 --- a/tools/pnnx/tests/test_nn_MultiheadAttention.py +++ b/tools/pnnx/tests/test_nn_MultiheadAttention.py @@ -24,31 +24,61 @@ def __init__(self): self.attention_0_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4) self.attention_0_1 = nn.MultiheadAttention(embed_dim=64, num_heads=8, bias=False, add_bias_kv=False, add_zero_attn=False) self.attention_0_2 = nn.MultiheadAttention(embed_dim=64, num_heads=16, bias=True, add_bias_kv=True, add_zero_attn=True) + self.attention_0_3 = nn.MultiheadAttention(embed_dim=32, num_heads=8, bias=True) + self.attention_0_4 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20) + self.attention_0_5 = nn.MultiheadAttention(embed_dim=40, num_heads=8, kdim=30, vdim=20, bias=False, add_bias_kv=False, add_zero_attn=False) + self.attention_0_6 = nn.MultiheadAttention(embed_dim=40, num_heads=10, kdim=30, vdim=20, bias=True, add_bias_kv=True, add_zero_attn=True) + if version.parse(torch.__version__) >= version.parse('1.9'): - self.attention_1_0 = nn.MultiheadAttention(embed_dim=40, num_heads=4, batch_first=True) - self.attention_1_1 = nn.MultiheadAttention(embed_dim=40, num_heads=8, bias=False, add_bias_kv=False, add_zero_attn=False, batch_first=True) - self.attention_1_2 = nn.MultiheadAttention(embed_dim=40, num_heads=10, bias=True, add_bias_kv=True, add_zero_attn=True, batch_first=True) + self.attention_1_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4, batch_first=True) + self.attention_1_1 = nn.MultiheadAttention(embed_dim=64, num_heads=8, bias=False, add_bias_kv=False, add_zero_attn=False, batch_first=True) + self.attention_1_2 = nn.MultiheadAttention(embed_dim=64, num_heads=16, bias=True, add_bias_kv=True, add_zero_attn=True, batch_first=True) + self.attention_1_3 = nn.MultiheadAttention(embed_dim=32, num_heads=8, bias=True, batch_first=True) - def forward(self, xq, xk, xv, z, yq, yk, yv, w): + self.attention_1_4 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20, batch_first=True) + self.attention_1_5 = nn.MultiheadAttention(embed_dim=40, num_heads=8, kdim=30, vdim=20, bias=False, add_bias_kv=False, add_zero_attn=False, batch_first=True) + self.attention_1_6 = nn.MultiheadAttention(embed_dim=40, num_heads=10, kdim=30, vdim=20, bias=True, add_bias_kv=True, add_zero_attn=True, batch_first=True) + + def forward(self, xq, xk, xv, z, yq, yk, yv): x0, x0w = self.attention_0_0(xq, xk, xv) x1, x1w = self.attention_0_1(xq, xk, xv) - x2, x2w = self.attention_0_2(xq, xk, xv) + x2, x2w = self.attention_0_2(xq, xk, xk) + x3, _ = self.attention_0_3(z, z, z) + x4, x4w = self.attention_0_4(yq, yk, yv) + x5, x5w = self.attention_0_5(yq, yk, yv) + x6, x6w = self.attention_0_6(yq, yk, yv) + if version.parse(torch.__version__) < version.parse('1.9'): - return x0, x0w, x1, x1w, x2, x2w, x3 + return x0, x0w, x1, x1w, x2, x2w, x3, x4, x4w, x5, x5w, x6, x6w + + xq = xq.transpose(0, 1) + xk = xk.transpose(0, 1) + xv = xv.transpose(0, 1) + z = z.transpose(0, 1) + yq = yq.transpose(0, 1) + yk = yk.transpose(0, 1) + yv = yv.transpose(0, 1) + + y0, y0w = self.attention_1_0(xq, xk, xv) + y1, y1w = self.attention_1_1(xq, xk, xv) + y2, y2w = self.attention_1_2(xq, xk, xk) - y0, y0w = self.attention_1_0(yq, yk, yv) - y1, y1w = self.attention_1_1(yq, yk, yv) - y2, y2w = self.attention_1_2(yq, yk, yv) - y3, _ = self.attention_1_3(w, w, w) + y3, _ = self.attention_1_3(z, z, 
z) - return x0, x0w, x1, x1w, x2, x2w, x3, y0, y0w, y1, y1w, y2, y2w, y3 + y4, y4w = self.attention_1_4(yq, yk, yv) + y5, y5w = self.attention_1_5(yq, yk, yv) + y6, y6w = self.attention_1_6(yq, yk, yv) + + return x0, x0w, x1, x1w, x2, x2w, x3, x4, x4w, x5, x5w, x6, x6w, y0, y0w, y1, y1w, y2, y2w, y3, y4, y4w, y5, y5w, y6, y6w def test(): + torch.set_grad_enabled(False) + net = Model() net.eval() @@ -57,24 +87,23 @@ def test(): xk = torch.rand(20, 1, 64) xv = torch.rand(20, 1, 64) z = torch.rand(30, 1, 32) - yq = torch.rand(1, 15, 40) - yk = torch.rand(1, 24, 40) - yv = torch.rand(1, 24, 40) - w = torch.rand(1, 20, 32) + yq = torch.rand(15, 1, 40) + yk = torch.rand(24, 1, 30) + yv = torch.rand(24, 1, 20) - a = net(xq, xk, xv, z, yq, yk, yv, w) + a = net(xq, xk, xv, z, yq, yk, yv) # export torchscript print(torch.__version__) if version.parse(torch.__version__) >= version.parse('1.12.0'): - mod = torch.jit.trace(net, (xq, xk, xv, z, yq, yk, yv, w), check_trace=False) + mod = torch.jit.trace(net, (xq, xk, xv, z, yq, yk, yv), check_trace=False) else: - mod = torch.jit.trace(net, (xq, xk, xv, z, yq, yk, yv, w)) + mod = torch.jit.trace(net, (xq, xk, xv, z, yq, yk, yv)) mod.save("test_nn_MultiheadAttention.pt") # torchscript to pnnx import os - os.system("../src/pnnx test_nn_MultiheadAttention.pt inputshape=[20,1,64],[20,1,64],[20,1,64],[30,1,32],[1,15,40],[1,24,40],[1,24,40],[1,20,32]") + os.system("../src/pnnx test_nn_MultiheadAttention.pt inputshape=[20,1,64],[20,1,64],[20,1,64],[30,1,32],[15,1,40],[24,1,30],[24,1,20]") # pnnx inference import test_nn_MultiheadAttention_pnnx diff --git a/tools/pnnx/tests/test_nn_Softmax2d.py b/tools/pnnx/tests/test_nn_Softmax2d.py new file mode 100644 index 000000000000..e75ce61d252f --- /dev/null +++ b/tools/pnnx/tests/test_nn_Softmax2d.py @@ -0,0 +1,56 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
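The Softmax2d test below exercises nn.Softmax2d, which applies softmax over the channel dimension independently at every spatial location, so for the (N, C, H, W) input used here it is numerically the same as nn.Softmax(dim=1). A short equivalence sketch, illustrative only:

import torch
import torch.nn as nn

x = torch.rand(1, 12, 24, 64)
a = nn.Softmax2d()(x)        # softmax across the 12 channels at each (h, w)
b = nn.Softmax(dim=1)(x)
assert torch.allclose(a, b)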
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softmax2d() + + def forward(self, x): + x = self.act_0(x) + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24, 64) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_nn_Softmax2d.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_nn_Softmax2d.pt inputshape=[1,12,24,64]") + + # pnnx inference + import test_nn_Softmax2d_pnnx + b = test_nn_Softmax2d_pnnx.test_inference() + + return torch.equal(a, b) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_Unfold.py b/tools/pnnx/tests/test_nn_Unfold.py new file mode 100644 index 000000000000..aece085668c6 --- /dev/null +++ b/tools/pnnx/tests/test_nn_Unfold.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.unfold_0 = nn.Unfold(kernel_size=3) + self.unfold_1 = nn.Unfold(kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + self.unfold_2 = nn.Unfold(kernel_size=(1,3), stride=1, padding=(2,4), dilation=(1,2)) + + def forward(self, x): + x0 = self.unfold_0(x) + x1 = self.unfold_1(x) + x2 = self.unfold_2(x) + + return x0, x1, x2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 64, 64) + + a0, a1, a2 = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_nn_Unfold.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_nn_Unfold.pt inputshape=[1,12,64,64]") + + # pnnx inference + import test_nn_Unfold_pnnx + b0, b1, b2 = test_nn_Unfold_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_pnnx_fuse_adjacent_reshape.py b/tools/pnnx/tests/test_pnnx_fuse_adjacent_reshape.py new file mode 100644 index 000000000000..8f44987fb5d9 --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_adjacent_reshape.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.view(1, 1, 8).reshape(2, -1) + y = y.reshape(-1, x.size(0)).unsqueeze(1) + z = z.unsqueeze(0).unsqueeze(2).view(-1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(8) + y = torch.rand(9, 10) + z = torch.rand(8, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_pnnx_fuse_adjacent_reshape.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_fuse_adjacent_reshape.pt inputshape=[8],[9,10],[8,9,10]") + + # pnnx inference + import test_pnnx_fuse_adjacent_reshape_pnnx + b = test_pnnx_fuse_adjacent_reshape_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_pnnx_fuse_pad_conv1d.py b/tools/pnnx/tests/test_pnnx_fuse_pad_conv1d.py new file mode 100644 index 000000000000..5e1e456f0013 --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_pad_conv1d.py @@ -0,0 +1,84 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
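One identity behind the pad/conv fusion tests that follow is that a standalone zero padding placed in front of a convolution with no padding is numerically the same as that convolution with its padding attribute enlarged, which is the kind of rewrite the fuse_pad_conv passes are expected to preserve. A minimal sketch of that equivalence for the 1d case, with layer sizes chosen for illustration rather than taken from the test:

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
conv = nn.Conv1d(in_channels=4, out_channels=6, kernel_size=3)
x = torch.rand(1, 4, 13)

a = conv(F.pad(x, pad=(2, 2)))                        # explicit zero pad, conv padding=0
b = F.conv1d(x, conv.weight, conv.bias, padding=2)    # same conv with the pad folded in
assert torch.allclose(a, b)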
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.pad_0 = nn.ConstantPad1d(2, 0.0) + self.pad_1 = nn.ReflectionPad1d(4) + self.pad_2 = nn.ReplicationPad1d(3) + + self.conv_0 = nn.Conv1d(in_channels=12, out_channels=14, kernel_size=3) + self.conv_1 = nn.Conv1d(in_channels=14, out_channels=14, kernel_size=1) + self.conv_2 = nn.Conv1d(in_channels=14, out_channels=14, kernel_size=2) + self.conv_3 = nn.Conv1d(in_channels=14, out_channels=12, kernel_size=3, padding=(1,)) + + def forward(self, x): + x = self.pad_0(x) + x = F.pad(x, pad=(1,1)) + x = self.conv_0(x) + + x = self.pad_1(x) + x = self.conv_1(x) + + x = F.pad(x, pad=(3,3), mode='reflect') + x = self.conv_1(x) + + x = self.pad_2(x) + x = self.conv_2(x) + + x = F.pad(x, pad=(1,1), mode='replicate') + x = self.conv_2(x) + + x = F.pad(x, pad=(2,2)) + x = self.conv_3(x) + + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 13) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_pnnx_pnnx_fuse_pad_conv1d.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_pnnx_fuse_pad_conv1d.pt inputshape=[1,12,13]") + + # pnnx inference + import test_pnnx_pnnx_fuse_pad_conv1d_pnnx + b = test_pnnx_pnnx_fuse_pad_conv1d_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_pnnx_fuse_pad_conv2d.py b/tools/pnnx/tests/test_pnnx_fuse_pad_conv2d.py new file mode 100644 index 000000000000..23d24100cff8 --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_pad_conv2d.py @@ -0,0 +1,86 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.pad_0 = nn.ConstantPad2d(2, 0.0) + self.pad_1 = nn.ReflectionPad2d(4) + self.pad_2 = nn.ReplicationPad2d(3) + self.pad_3 = nn.ZeroPad2d((1,1,0,0)) + + self.conv_0 = nn.Conv2d(in_channels=12, out_channels=14, kernel_size=3) + self.conv_1 = nn.Conv2d(in_channels=14, out_channels=14, kernel_size=1) + self.conv_2 = nn.Conv2d(in_channels=14, out_channels=14, kernel_size=2) + self.conv_3 = nn.Conv2d(in_channels=14, out_channels=12, kernel_size=3, padding=(1,1)) + + def forward(self, x): + x = self.pad_0(x) + x = F.pad(x, pad=(1,1)) + x = self.conv_0(x) + + x = self.pad_1(x) + x = self.conv_1(x) + + x = F.pad(x, pad=(3,3,2,2), mode='reflect') + x = self.conv_1(x) + + x = self.pad_2(x) + x = self.conv_2(x) + + x = F.pad(x, pad=(1,1,1,1), mode='replicate') + x = self.conv_2(x) + + x = self.pad_3(x) + x = F.pad(x, pad=(2,2,0,0)) + x = self.conv_3(x) + + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 13, 13) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_pnnx_pnnx_fuse_pad_conv2d.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_pnnx_fuse_pad_conv2d.pt inputshape=[1,12,13,13]") + + # pnnx inference + import test_pnnx_pnnx_fuse_pad_conv2d_pnnx + b = test_pnnx_pnnx_fuse_pad_conv2d_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_abs.py b/tools/pnnx/tests/test_torch_abs.py new file mode 100644 index 000000000000..9d24e6d3057c --- /dev/null +++ b/tools/pnnx/tests/test_torch_abs.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
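From here on the section adds one small test per elementwise torch math op (abs, acos, acosh, asin, asinh, atan, atanh, ceil, cos, cosh, exp, floor, log, plus a few binary ones such as atan2 and the bitwise shifts). They all follow the same pattern: apply the op to three tensors of different ranks, trace to TorchScript, convert with pnnx using an explicit inputshape, import the generated *_pnnx module and compare with torch.equal; inputs are shifted into the op's valid domain where needed (x - 0.5 for abs/atanh, x + 1 for acosh). A condensed sketch of that shared skeleton, with an illustrative helper name that does not exist in the suite:

import os
import torch

def run_elementwise_test(op, name):
    # build -> trace -> convert with pnnx -> run the generated module -> compare
    class M(torch.nn.Module):
        def forward(self, x, y, z):
            return op(x), op(y), op(z)

    torch.manual_seed(0)
    x = torch.rand(1, 3, 16)
    y = torch.rand(1, 5, 9, 11)
    z = torch.rand(14, 8, 5, 9, 10)
    expected = M()(x, y, z)

    mod = torch.jit.trace(M(), (x, y, z))
    mod.save(f"test_torch_{name}.pt")
    os.system(f"../src/pnnx test_torch_{name}.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]")

    generated = __import__(f"test_torch_{name}_pnnx")
    actual = generated.test_inference()
    return all(torch.equal(a, b) for a, b in zip(expected, actual))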
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.abs(x - 0.5) + y = torch.abs(y - 0.5) + z = torch.abs(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_abs.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_abs.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_abs_pnnx + b = test_torch_abs_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_acos.py b/tools/pnnx/tests/test_torch_acos.py new file mode 100644 index 000000000000..7380f753d877 --- /dev/null +++ b/tools/pnnx/tests/test_torch_acos.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.acos(x) + y = torch.acos(y) + z = torch.acos(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_acos.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_acos.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_acos_pnnx + b = test_torch_acos_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_acosh.py b/tools/pnnx/tests/test_torch_acosh.py new file mode 100644 index 000000000000..5d572a4ab87a --- /dev/null +++ b/tools/pnnx/tests/test_torch_acosh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.acosh(x + 1) + y = torch.acosh(y + 1) + z = torch.acosh(z + 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_acosh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_acosh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_acosh_pnnx + b = test_torch_acosh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_asin.py b/tools/pnnx/tests/test_torch_asin.py new file mode 100644 index 000000000000..2b4f3cb05422 --- /dev/null +++ b/tools/pnnx/tests/test_torch_asin.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.asin(x) + y = torch.asin(y) + z = torch.asin(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_asin.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_asin.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_asin_pnnx + b = test_torch_asin_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_asinh.py b/tools/pnnx/tests/test_torch_asinh.py new file mode 100644 index 000000000000..9f80fbabcf0e --- /dev/null +++ b/tools/pnnx/tests/test_torch_asinh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.asinh(x) + y = torch.asinh(y) + z = torch.asinh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_asinh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_asinh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_asinh_pnnx + b = test_torch_asinh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_atan.py b/tools/pnnx/tests/test_torch_atan.py new file mode 100644 index 000000000000..8fd797b918c7 --- /dev/null +++ b/tools/pnnx/tests/test_torch_atan.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.atan(x) + y = torch.atan(y) + z = torch.atan(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_atan.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_atan.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_atan_pnnx + b = test_torch_atan_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_atan2.py b/tools/pnnx/tests/test_torch_atan2.py new file mode 100644 index 000000000000..27bf5c5deb33 --- /dev/null +++ b/tools/pnnx/tests/test_torch_atan2.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.atan2(x, y) + out1 = torch.atan2(y, y) + out2 = torch.atan2(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_atan2.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_atan2.pt inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_atan2_pnnx + b = test_torch_atan2_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_atanh.py b/tools/pnnx/tests/test_torch_atanh.py new file mode 100644 index 000000000000..2cc2ce60d812 --- /dev/null +++ b/tools/pnnx/tests/test_torch_atanh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.atanh(x - 0.5) + y = torch.atanh(y - 0.5) + z = torch.atanh(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_atanh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_atanh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_atanh_pnnx + b = test_torch_atanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_bitwise_left_shift.py b/tools/pnnx/tests/test_torch_bitwise_left_shift.py new file mode 100644 index 000000000000..cc60f144b11e --- /dev/null +++ b/tools/pnnx/tests/test_torch_bitwise_left_shift.py @@ -0,0 +1,55 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y): + out = torch.bitwise_left_shift(x, y) + return out + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.randint(10, (3, 16), dtype=torch.int) + y = torch.randint(10, (3, 16), dtype=torch.int) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_torch_bitwise_left_shift.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_bitwise_left_shift.pt inputshape=[3,16]i32,[3,16]i32") + + # pnnx inference + import test_torch_bitwise_left_shift_pnnx + b = test_torch_bitwise_left_shift_pnnx.test_inference() + + return torch.equal(a, b) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_bitwise_right_shift.py b/tools/pnnx/tests/test_torch_bitwise_right_shift.py new file mode 100644 index 000000000000..59d6c9651db0 --- /dev/null +++ b/tools/pnnx/tests/test_torch_bitwise_right_shift.py @@ -0,0 +1,55 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y): + out = torch.bitwise_right_shift(x, y) + return out + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.randint(10, (3, 16), dtype=torch.int) + y = torch.randint(10, (3, 16), dtype=torch.int) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_torch_bitwise_right_shift.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_bitwise_right_shift.pt inputshape=[3,16]i32,[3,16]i32") + + # pnnx inference + import test_torch_bitwise_right_shift_pnnx + b = test_torch_bitwise_right_shift_pnnx.test_inference() + + return torch.equal(a, b) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_ceil.py b/tools/pnnx/tests/test_torch_ceil.py new file mode 100644 index 000000000000..bd6379a7b3e9 --- /dev/null +++ b/tools/pnnx/tests/test_torch_ceil.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.ceil(x * 10) + y = torch.ceil(y * 10) + z = torch.ceil(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_ceil.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_ceil.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_ceil_pnnx + b = test_torch_ceil_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_cos.py b/tools/pnnx/tests/test_torch_cos.py new file mode 100644 index 000000000000..9d60eb6613e7 --- /dev/null +++ b/tools/pnnx/tests/test_torch_cos.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.cos(x) + y = torch.cos(y) + z = torch.cos(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_cos.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_cos.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_cos_pnnx + b = test_torch_cos_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_cosh.py b/tools/pnnx/tests/test_torch_cosh.py new file mode 100644 index 000000000000..7190e7f9e46d --- /dev/null +++ b/tools/pnnx/tests/test_torch_cosh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.cosh(x) + y = torch.cosh(y) + z = torch.cosh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_cosh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_cosh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_cosh_pnnx + b = test_torch_cosh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_exp.py b/tools/pnnx/tests/test_torch_exp.py new file mode 100644 index 000000000000..507d96a22095 --- /dev/null +++ b/tools/pnnx/tests/test_torch_exp.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.exp(x) + y = torch.exp(y) + z = torch.exp(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_exp.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_exp.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_exp_pnnx + b = test_torch_exp_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_floor.py b/tools/pnnx/tests/test_torch_floor.py new file mode 100644 index 000000000000..04b4cb96c220 --- /dev/null +++ b/tools/pnnx/tests/test_torch_floor.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. 
All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.floor(x * 10) + y = torch.floor(y * 10) + z = torch.floor(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_floor.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_floor.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_floor_pnnx + b = test_torch_floor_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_log.py b/tools/pnnx/tests/test_torch_log.py new file mode 100644 index 000000000000..f98928d0a0db --- /dev/null +++ b/tools/pnnx/tests/test_torch_log.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.log(x) + y = torch.log(y) + z = torch.log(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_log.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_log.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_log_pnnx + b = test_torch_log_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_neg.py b/tools/pnnx/tests/test_torch_neg.py new file mode 100644 index 000000000000..e3424f2176c5 --- /dev/null +++ b/tools/pnnx/tests/test_torch_neg.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.neg(x - 0.5) + y = torch.neg(y - 0.5) + z = torch.neg(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_neg.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_neg.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_neg_pnnx + b = test_torch_neg_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_pow.py b/tools/pnnx/tests/test_torch_pow.py new file mode 100644 index 000000000000..85bebce3629e --- /dev/null +++ b/tools/pnnx/tests/test_torch_pow.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.pow(x, y) + out1 = torch.pow(y, y) + out2 = torch.pow(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_pow.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_pow.pt inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_pow_pnnx + b = test_torch_pow_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_reciprocal.py b/tools/pnnx/tests/test_torch_reciprocal.py new file mode 100644 index 000000000000..e508929c2241 --- /dev/null +++ b/tools/pnnx/tests/test_torch_reciprocal.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.reciprocal(x) + y = torch.reciprocal(y) + z = torch.reciprocal(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_reciprocal.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_reciprocal.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_reciprocal_pnnx + b = test_torch_reciprocal_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_rsqrt.py b/tools/pnnx/tests/test_torch_rsqrt.py new file mode 100644 index 000000000000..ec39dae71b5b --- /dev/null +++ b/tools/pnnx/tests/test_torch_rsqrt.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.rsqrt(x) + y = torch.rsqrt(y) + z = torch.rsqrt(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_rsqrt.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_rsqrt.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_rsqrt_pnnx + b = test_torch_rsqrt_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_sign.py b/tools/pnnx/tests/test_torch_sign.py new file mode 100644 index 000000000000..b834b7ea15e1 --- /dev/null +++ b/tools/pnnx/tests/test_torch_sign.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sign(x - 0.5) + y = torch.sign(y - 0.5) + z = torch.sign(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sign.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_sign.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_sign_pnnx + b = test_torch_sign_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_sin.py b/tools/pnnx/tests/test_torch_sin.py new file mode 100644 index 000000000000..b0aa628a3717 --- /dev/null +++ b/tools/pnnx/tests/test_torch_sin.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sin(x) + y = torch.sin(y) + z = torch.sin(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sin.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_sin.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_sin_pnnx + b = test_torch_sin_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_sinh.py b/tools/pnnx/tests/test_torch_sinh.py new file mode 100644 index 000000000000..8f49e78652a4 --- /dev/null +++ b/tools/pnnx/tests/test_torch_sinh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. 
All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sinh(x) + y = torch.sinh(y) + z = torch.sinh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sinh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_sinh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_sinh_pnnx + b = test_torch_sinh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_sqrt.py b/tools/pnnx/tests/test_torch_sqrt.py new file mode 100644 index 000000000000..6cb88569ec49 --- /dev/null +++ b/tools/pnnx/tests/test_torch_sqrt.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sqrt(x) + y = torch.sqrt(y) + z = torch.sqrt(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sqrt.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_sqrt.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_sqrt_pnnx + b = test_torch_sqrt_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_square.py b/tools/pnnx/tests/test_torch_square.py new file mode 100644 index 000000000000..65069b0e71d9 --- /dev/null +++ b/tools/pnnx/tests/test_torch_square.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.square(x) + y = torch.square(y) + z = torch.square(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_square.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_square.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_square_pnnx + b = test_torch_square_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_tan.py b/tools/pnnx/tests/test_torch_tan.py new file mode 100644 index 000000000000..bd3ca4cf863f --- /dev/null +++ b/tools/pnnx/tests/test_torch_tan.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.tan(x) + y = torch.tan(y) + z = torch.tan(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_tan.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_tan.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_tan_pnnx + b = test_torch_tan_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_tanh.py b/tools/pnnx/tests/test_torch_tanh.py new file mode 100644 index 000000000000..9157f2daef01 --- /dev/null +++ b/tools/pnnx/tests/test_torch_tanh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.tanh(x) + y = torch.tanh(y) + z = torch.tanh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_tanh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_tanh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_tanh_pnnx + b = test_torch_tanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_trunc.py b/tools/pnnx/tests/test_torch_trunc.py new file mode 100644 index 000000000000..95e82dbb8a1f --- /dev/null +++ b/tools/pnnx/tests/test_torch_trunc.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. 
All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.trunc(x * 10 - 5) + y = torch.trunc(y * 10 - 5) + z = torch.trunc(z * 10 - 5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_trunc.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_trunc.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_trunc_pnnx + b = test_torch_trunc_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1)